ms-20240410.cu
An execution example on GeForce RTX-4090
$ nvcc -O3 -arch=sm_60 -maxrregcount=40 ms-20240410.cu -lcrypto $ time ./a.out 00000 e00000007 13745 180800250 ms-20240410.cu Size of long long unsigned : 8 topbit() works correctly. nvecs 32134 size of p_idx_c 2300 DevId: 0 #SM:128, #Blocks:2048 NVIDIA GeForce RTX 4090 Total warps: 6144 r1:e00000007 r2:180800250 nrows 3116 n3rd 711 row_set.size() 5956644, 242 sets/warp 00000 e00000007 13745 180800250 247438137089 824ce60dca0861defc383ee53d3367e7 00000 e00000007 13745 180800250 S 286856560469 65e40aa25a4edf87381bb51748779337 real 1m56.533s user 1m56.274s sys 0m0.256s
- ms-20240410.cu
/* ms.cu
   Counting number of magic squares up to rotations, reflections and M
   transformations. Simultaneously counting semi-magic squares.

   Compilation :
     nvcc -O3 -arch=sm_60 -maxrregcount=40 \
          -Wno-deprecated-declarations ms.cu -lcrypto
   To omit md5 checksum, add a -DnoMD5 option. Then you may drop
   -Wno-deprecated-declarations and -lcrypto.
   -DV option produces verbose outputs for N=6.

   Command line options:
     % ./a.out <1stRowNo> <1stRowPattern> <2ndRowNo> <2ndRowPattern>
   example : ./a.out 00000 e00000007 13745 180800250
   See also : https://magicsquare6.net/

   Updated on 2024/04/10.
   Updated on 2023/11/28: Critical __syncwarp() added in LastCols().
   Authored by Hidetoshi Mino hidetoshimino@gmail.com . */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <assert.h>
#ifndef noMD5
#include <openssl/md5.h>
#endif
#include <vector>
using namespace std;

#ifndef N // order of square : 3..6
#define N 6
#endif
#ifndef NB_SM // Number of blocks per SM
#define NB_SM 16
#endif
#ifndef NW // Number of Warps per block
#define NW 3
#endif
#ifndef NSMPL   // sampling stride in MakeRows() (1 = enumerate all)
#define NSMPL 1
#endif
#ifndef MDIAG   // max diagonal-candidate count handled by the pair table
#define MDIAG 24
#endif
#ifndef CINTLIM // sanity bound (bits) on the per-call semi-magic int counter
#define CINTLIM 26
#endif

typedef long long unsigned u64;

#define NN (N*N)                        /* number of cells */
#define AllBits ( ( 1ULL << NN ) - 1 )  /* mask of all NN cell bits */
#define AllLanes (0xffffffff)           /* full-warp participation mask */
#define MAX( x, y ) ( (x) > (y) ? (x) : (y) )

/* Abort with file/line context on any CUDA runtime API failure. */
#define CU_SAFE(stmt) \
  do { \
    cudaError_t status = stmt; \
    if (status != cudaSuccess) { \
      printf("ERROR: %s in %s:%d.\n", cudaGetErrorString(status), \
             __FILE__, __LINE__); \
      exit( 1 ); \
    } \
  } while (0)

#define WS 32 /* Warp size */
#define RUWS( x ) ( ( ( x ) + ( WS - 1 ) ) & ~( WS - 1 ) ) /* Round up to WS */
#define RDWS( x ) ( ( x ) & ~( WS - 1 ) )                  /* Round down to WS */

/* Index of the highest set bit of v (v must be non-zero).
   Device: via __clzll. Host: float-exponent trick — v is first stripped so
   that no two adjacent bits remain, then the IEEE-754 exponent of (float)v
   yields the position. check_topbit() in main() verifies this works for the
   bit patterns this program uses (single bits and low masks up to NN bits). */
__device__ __host__ int topbit( u64 v ) {
#ifdef __CUDA_ARCH__
  return 63 - __clzll( v );
#else
  v = v & ( ~( v >> 1 ) );
  float f32 = (float) v;
  int pos = *( (int *) &f32 );
  pos =( pos >> 23 ) - 0x7f;
  return pos;
#endif
  /* Portable branch-free fallback, kept for reference:
  int pos = 0; int v32 = v >> 32;
  if( v32 ) { pos = 32; } else { v32 = (int) v; }
  if( v32 & 0xffff0000UL ) { pos +=16; v32 >>= 16; }
  if( v32 & 0x0000ff00UL ) { pos += 8; v32 >>= 8; }
  if( v32 & 0x000000f0UL ) { pos += 4; v32 >>= 4; }
  if( v32 & 0x0000000cUL ) { pos += 2; v32 >>= 2; }
  if( v32 & 0000000002UL ) pos++;
  return pos; */
}

/* Full 64-bit bit reversal. Device: __brevll. Host: log-step swap network. */
__device__ __host__ u64 reverse64( u64 x ) {
#ifdef __CUDA_ARCH__
  return __brevll( x );
#else
  u64 y;
  y = ( ( x & 0x5555555555555555ULL ) << 1 );
  x = ( y | ( ( x >> 1 ) & 0x5555555555555555ULL ) );
  y = ( ( x & 0x3333333333333333ULL ) << 2 );
  x = ( y | ( ( x >> 2 ) & 0x3333333333333333ULL ) );
  y = ( ( x & 0x0F0F0F0F0F0F0F0FULL ) << 4 );
  x = ( y | ( ( x >> 4 ) & 0x0F0F0F0F0F0F0F0FULL ) );
  y = ( ( x & 0x00FF00FF00FF00FFULL ) << 8 );
  x = ( y | ( ( x >> 8 ) & 0x00FF00FF00FF00FFULL ) );
  y = ( ( x & 0x0000FFFF0000FFFFULL ) << 16 );
  x = ( y | ( ( x >> 16 ) & 0x0000FFFF0000FFFFULL ) );
  y = ( ( x & 0x00000000FFFFFFFFULL ) << 32 );
  x = ( y | ( ( x >> 32 ) & 0x00000000FFFFFFFFULL ) );
  return x;
#endif
}

/* Self-test of the host topbit() on the patterns this program relies on:
   single set bits 2^i and masks (2^i - 1), for i < NN.
   Returns 0 on success, 1 on the first mismatch. */
int check_topbit( ) {
  for( int i = 0; i < NN; i ++ ) {
    // printf( "%d %09llx %d\n", i, 1ULL << i, topbit( 1ULL << i ) );
    if( topbit( 1ULL << i ) != i ) return 1;
  }
  for( int i = 1; i < NN; i ++ ) {
    // printf( "%d %09llx %d\n"
    //       , i, ( 1ULL << i ) - 1, topbit( ( 1ULL << i ) - 1 ) );
    if( topbit( ( 1ULL << i ) - 1 ) != i - 1 ) return 1;
  }
  fprintf( stderr, "topbit() works correctly.\n" );
  return 0;
}

/* Per-warp scratch sizes (in u64 elements) for the column / diagonal
   candidate work areas: one warp-aligned slab per remaining level plus a
   WS-sized guard slot used as an overrun canary (checked in RowSet()). */
__device__ __host__ int vcsize_f( int ncols, int order ) {
  return RUWS( ncols ) * ( order - 2 ) + WS;
}
__device__ __host__ int vxsize_f( int ndiags, int order ) {
  return RUWS( ndiags ) * ( order - 1 ) + WS;
}

/* = Host globals ======================= */
#define MVECS 32768
int nvecs = 0;        /* number of generated candidate line vectors */
u64 vecs[ MVECS ];    /* all N-cell bit vectors with the magic sum, sorted
                         descending by construction order in GenVecs() */
int totalwarps;       /* sum over devices of (#blocks * NW) */

/* = Constant memory ====================
   All set from the host with cudaMemcpyToSymbol (in Count() and main()). */
__constant__ u64 v_max_c;        /* first (maximal) row pattern */
__constant__ unsigned v_max32_c; /* its top 32 bits (for LastCols) */
__constant__ u64 rev_max_c;      /* bit-reversed first row */
__constant__ unsigned rev_max32_c;
__constant__ u64 v_second_c;     /* second row pattern */
__constant__ bool single_c;      /* true if square counts once (self-symmetric) */
__constant__ int nrowset_c;      /* total u64 entries in the row-set array */
__constant__ int ncols_c, ndiags_c;
__constant__ int vcsize_c, vxsize_c; /* per-warp scratch sizes, see vcsize_f/vxsize_f */
__constant__ int nbatch_c;       /* initial row-set batch size per warp */
__constant__ int totalwarps_c;
#define MDPAIR ( MDIAG * MDIAG * MDIAG / 5 + ( WS - 1 ) * MDIAG )
__constant__ int p_offset_c[ MDIAG + 2 ]; /* per-count offsets into p_idx_c */
__constant__ int p_idx_c[ MDPAIR ];       /* packed (i<<16)|j pair indices */

/* = Block __shared__ ==================== */
__shared__ int which_row[ NW ][ NN ];  /* cell value -> row level, see set_vecr() */
__shared__ u64 which_colv[ NW ][ NN ]; /* cell value -> its column vector, see set_vecc() */

/* = Managed ( Unified ) ===================
   Shared between host and all devices; updated with atomicAdd_system. */
__managed__ int i_rowset_m = 0;   /* global work-stealing cursor into row sets */
__managed__ u64 count_m = 0ULL;   /* magic-square count for current (r1,r2) */
__managed__ u64 sm_count_m = 0ULL;/* semi-magic count for current (r1,r2) */
/* ========================================= */

void Count( int max_row, int second_row, const vector<int> & dev_blocks );
void MakeRows( int m, int l, u64 rem, u64* vrb, u64* vre, vector<u64> & res );
__global__ void RowSet( u64* row_set_d, u64* vc_g, u64* vx_g );
__device__ u64 MakeCols( u64* __restrict__ const & vcb, const int & nvc,
                         u64* __restrict__ const & vxb, const int & nvx,
                         const bool & single, int & sm_count );
__device__ void LastCols( unsigned rem,
                          const u64* __restrict__ const & vcb,
                          const int & nvn, const int nvcmn,
                          u64* __restrict__ const & vxb, const int & nvx,
                          const bool single, u64 & sub_count, int & sm_count );
__device__ int MakeDiagC( const u64* __restrict__ const & vxb, const int & nvx );
__device__ int MakeDiagS( const u64* __restrict__ const & vxb, const int & nvx );
__device__ bool FinalCheck(
const u64 & vdx, const u64 & vcx );

/* True iff the vectors at indices v1, v2 share exactly one common bit
   (intersection is non-zero and a power of two). */
int Cross( int v1, int v2 ) {
  u64 v = vecs[v1] & vecs[v2];
  return v && ( ! ( v & ( v - 1 ) ) );
}

/* Reverse an NN-bit cell vector (180-degree rotation of the flat mask). */
__device__ __host__ u64 Reverse( u64 v ) {
  return reverse64( v ) >> (64 - NN);
}

/* Enumerate all bit vectors choosing (level+1) distinct entries from
   1..max_ent whose values sum to rem_sum, appending each completed mask
   (bit e-1 set for entry e) to vecs[]. Entries are chosen in strictly
   decreasing order so each combination is generated once. */
void GenVecs( int max_ent, int rem_sum, int level, u64 vp ) {
  if ( level >= 0 ) {
    for ( int ent = max_ent; ent > 0 ; --ent ) {
      u64 v = 1ULL << ( ent - 1 );
      if ( rem_sum >= ent )
        GenVecs( ent - 1, rem_sum - ent, level - 1, vp | v );
    }
  } else {
    if ( rem_sum == 0 ) {
      vecs[ nvecs ++ ] = vp;
    }
  }
  if( nvecs > MVECS ) {
    fprintf( stderr, "nvecs > MVECS\n" );
    exit(1);
  }
}

/* Host driver for one (first row, second row) pair:
   builds the compatible row / column / diagonal candidate lists, enumerates
   all completions of the remaining N-2 rows (row_set), uploads everything to
   each device, configures constant memory, and launches the RowSet kernel on
   every device. Results accumulate into the managed count_m / sm_count_m
   (zeroed by the caller, not here). */
void Count( int max_row, int second_row, const vector<int> & dev_blocks ) {
  u64 v_max = vecs[ max_row ];
  unsigned v_max32 = ( NN > 32 ) ? v_max >> ( NN - 32 ) : v_max << ( 32 - NN );
  u64 v_second = vecs[ second_row ];
  u64 rem = AllBits ^ v_max ^ v_second;     /* cells not yet used by rows 1,2 */
  u64 must_bit = ( 1ULL << topbit( rem ) ); /* largest unused cell: the next
                                               row must contain it */
  vector<u64> rows;  rows.reserve( nvecs );
  vector<u64> cols;  cols.reserve( nvecs );
  vector<u64> diags; diags.reserve( nvecs );
  int n3rd = 0;
  /* Row candidates: disjoint from both fixed rows, and canonical w.r.t.
     reversal ordering. n3rd counts those usable as the very next row. */
  for( int i = second_row + 1; i < nvecs; ++ i )
    if( ( ! ( v_max & vecs[ i ] ) ) && ( ! ( v_second & vecs[ i ] ) )
        && v_max >= Reverse( vecs[ i ] ) ) {
      rows.push_back( vecs[ i ] );
      if( vecs[ i ] & must_bit ) n3rd ++;
    }
  int nrows = rows.size();
  vector<u64> row_set;
  row_set.reserve( n3rd * 3000 * ( N - 2 ) );  /* heuristic capacity */
  u64 vr[ nrows * ( N - 2 ) ];                 /* VLA work buffer for MakeRows */
  for( int i = 0; i < nrows; i ++ ) vr[ i ] = rows[ i ];
  fprintf( stderr, "nrows %d n3rd %d \n", nrows, n3rd );
  MakeRows( N - 2, 0, rem, vr, vr + nrows, row_set );
  fprintf( stderr, "row_set.size() %7lu, %5lu sets/warp \n",
           row_set.size(), row_set.size() / ( N - 2 ) / totalwarps );
  /* Initial per-warp batch: ~1/3 of an even share, at least 1. */
  int nbatch_ini = max( row_set.size() / ( N - 2 ) / totalwarps / 3, 1ul );
  /* Column candidates: cross both fixed rows in exactly one cell. */
  for( int i = max_row + 1; i < nvecs; ++ i )
    if( Cross( max_row, i ) && Cross( second_row, i )
        && v_max >= Reverse( vecs[ i ] ) ) {
      cols.push_back( vecs[ i ] );
    }
  int ncols = cols.size();
  /* Diagonal candidates: same crossing property, no canonicity filter. */
  for( int i = 0; i < nvecs; ++ i )
    if( Cross( max_row, i ) && Cross( second_row, i ) )
      diags.push_back( vecs[ i ] );
  int ndiags = diags.size();

  /* constant parameters for the __global__ RowSet() */
  u64 rev_max = Reverse( v_max );
  unsigned rev_max32 = unsigned( rev_max );
  bool single = rev_max == v_max or rev_max == v_second;
  int nrowset = row_set.size();
  int vcsize = vcsize_f( ncols, N );
  int vxsize = vxsize_f( ndiags, N );

  // Device Memory Allocation -------------------
  int ndev = dev_blocks.size();
  u64* row_set_d[ ndev ];
  u64* vc_d[ ndev ];
  u64* vx_d[ ndev ];
  i_rowset_m = 0; // initialize managed variable
  // count_m = 0ULL; // Don't initialize here.
  for( int idev = 0 ; idev < ndev; idev ++ ) {
    CU_SAFE( cudaSetDevice( idev ) );
    int nblocks = dev_blocks[ idev ];
    CU_SAFE( cudaMalloc( row_set_d + idev,
                         sizeof( u64 ) * row_set.size() ) );
    CU_SAFE( cudaMemcpy( row_set_d[ idev ], row_set.data(),
                         sizeof( u64 ) * row_set.size(),
                         cudaMemcpyHostToDevice ) );
    /* Candidate lists at the front, then one private scratch slab per warp. */
    CU_SAFE( cudaMalloc( vc_d + idev,
                         sizeof(u64) * ( RUWS( ncols ) + vcsize * nblocks * NW ) ) );
    CU_SAFE( cudaMalloc( vx_d + idev,
                         sizeof(u64) * ( RUWS( ndiags ) + vxsize * nblocks * NW ) ) );
    CU_SAFE( cudaMemcpy( vc_d[ idev ], cols.data(),
                         ncols*sizeof(u64), cudaMemcpyHostToDevice ) );
    CU_SAFE( cudaMemcpy( vx_d[ idev ], diags.data(),
                         ndiags*sizeof(u64), cudaMemcpyHostToDevice ) );
    // constant memory setup --------------------
    CU_SAFE( cudaMemcpyToSymbol( v_max_c, &v_max, sizeof( u64 ) ) );
    CU_SAFE( cudaMemcpyToSymbol( v_max32_c, &v_max32, sizeof(unsigned) ) );
    CU_SAFE( cudaMemcpyToSymbol( v_second_c, &v_second, sizeof( u64 ) ) );
    CU_SAFE( cudaMemcpyToSymbol( rev_max_c, &rev_max, sizeof( u64 ) ) );
    CU_SAFE( cudaMemcpyToSymbol( rev_max32_c, &rev_max32, sizeof(unsigned) ) );
    CU_SAFE( cudaMemcpyToSymbol( single_c, &single, sizeof( bool ) ) );
    CU_SAFE( cudaMemcpyToSymbol( nrowset_c, &nrowset, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( ncols_c, &ncols, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( ndiags_c, &ndiags, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( vcsize_c, &vcsize, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( vxsize_c, &vxsize, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( nbatch_c, &nbatch_ini, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( totalwarps_c, &totalwarps, sizeof( int ) ) );
    // ------------------------------------------
    RowSet <<< nblocks, dim3( WS, NW ) >>>
      ( row_set_d[ idev ], vc_d[ idev ], vx_d[ idev ] );
  }
  for( int idev = 0 ; idev < ndev; idev ++ ) {
    CU_SAFE( cudaSetDevice( idev ) );
    // cudaDeviceSynchronize();
    // Device Memory Deallocation ----------------
    /* cudaFree blocks until the device's pending work completes, so the
       kernels have finished (and managed counters are final) on return. */
    CU_SAFE( cudaFree( vx_d[ idev ] ) );
    CU_SAFE( cudaFree( vc_d[ idev ] ) );
    CU_SAFE( cudaFree( row_set_d[ idev ] ) );
  }
  return;
}

/* Recursively enumerate ordered sets of m disjoint rows drawn from
   [vrb, vre) that together can still cover rem. Each level must use the
   highest still-uncovered cell (must_bit), which both prunes and guarantees
   uniqueness; surviving candidates for the next level are compacted into the
   tail of the buffer. Completed sets of m rows are appended flat to res.
   NOTE(review): row_vecs is static, so this is not reentrant — single
   threaded host use only. */
void MakeRows( int m, int l, u64 rem, u64* vrb, u64* vre, vector<u64> & res ) {
  static u64 row_vecs[ N ];
  u64 must_bit = ( 1ULL << topbit( rem ) );
  for( ; vrb < vre; vrb += NSMPL ) {
    u64 row_vec = *vrb;
    if( ! ( row_vec & must_bit ) ) break; /* candidates with must_bit come first */
    row_vecs[ l ] = row_vec;
    if( m > l + 1 ) {
      /* Compact the candidates disjoint from row_vec into [vre, vrq). */
      u64* vrp = vrb + 1;
      u64* vrq = vre;
      while( vrp < vre ) {
        *vrq = *vrp;
        vrq += ( row_vec & *vrp ) ? 0 : 1;
        vrp ++;
      }
      MakeRows( m, l + 1, rem ^ row_vec, vre, vrq, res );
    } else {
      for( int i = 0; i < m; i ++ ) {
        res.push_back( row_vecs[ i ] );
      }
    }
  }
}

/* Warp-cooperative stream compaction over vp[bv..ev): keep entries disjoint
   from x, writing them contiguously to vpn via ballot + prefix-popcount.
   nvn returns how many kept entries contain the bit `must_bit` (these are
   valid "next" candidates). Returns the total number kept. All 32 lanes
   must call this together. */
__device__ int gather_para( const u64* __restrict__ const vp,
                            const int & bv, const int & ev, const u64 & x,
                            u64* __restrict__ const & vpn,
                            const int must_bit, int & nvn ) {
  int j = 0;
  nvn = 0;
  for( int i = /*RDWS*/(bv); i < ev; i += WS ) {
    u64 v = -1ULL;                       /* pad lanes fail the predicate */
    if( i + threadIdx.x < ev ) { v = vp[ i + threadIdx.x ]; }
    bool p = ! ( v & x );
    bool nesp = p and ( v >> must_bit );
    unsigned ballot = __ballot_sync( AllLanes, p );
    int sub_sum = __popc( ballot << ( WS - threadIdx.x ) ); /* exclusive prefix */
    nvn += __popc( __ballot_sync( AllLanes, nesp ) );
    if( p ) { vpn[ j + sub_sum ] = v; }
    j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );      /* total kept */
  }
  return j;
}

/* Count the leading run of entries in vp[0..nv) whose `must_bit` is set,
   stopping at the first warp-chunk where the run ends. Assumes such entries
   are grouped at the front (as produced by gather_para). */
__device__ int cnext( const u64* __restrict__ vp, const int & nv,
                      const int must_bit ) {
  int j = 0;
  for( int i = 0; i < nv; i += WS ) {
    bool p = false;
    if( i + threadIdx.x < nv ) { p = vp[ i + threadIdx.x ] >> must_bit; }
    j += __popc( __ballot_sync( AllLanes, p ) );
    if( j != i + WS ) break;
  }
  return j;
}

/* Warp-cooperative compaction: keep entries of vp sharing exactly one bit
   with x; same ballot/prefix-sum scheme as gather_para. Returns count. */
__device__ int gather_cross( const u64* __restrict__ const & vp,
                             const int & nv, const u64 & x,
                             u64* __restrict__ const & vpn ) {
  int j = 0;
  for( int i = 0; i < nv; i += WS ) {
    u64 v = 0ULL;
    if( i + threadIdx.x < nv ) v = vp[ i + threadIdx.x ];
    bool p = __popcll( v & x ) == 1;
    unsigned ballot = __ballot_sync( AllLanes, p );
    int sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
    if( p ) vpn[ j + sub_sum ] = v;
    j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
  }
  return j;
}

/* As gather_cross, but an entry must cross BOTH x1 and x2 in exactly one
   bit each. Warp-cooperative; returns the number of kept entries. */
__device__ int gather_double_cross( const u64* __restrict__ const & vp,
                                    const int & nv, const u64 & x1,
                                    const u64 x2,
                                    u64* __restrict__ const & vpn ) {
  int j = 0;
  for( int i = 0; i < nv; i += WS ) {
    u64 v = 0ULL;
    if( i + threadIdx.x < nv ) v = vp[ i + threadIdx.x ];
    bool p = __popcll( v & x1 ) == 1 and __popcll( v & x2 ) == 1;
    unsigned ballot = __ballot_sync( AllLanes, p );
    int sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
    if( p ) vpn[ j + sub_sum ] = v;
    j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
  }
  return j;
}

/* As gather_cross, but requires a single-bit crossing with x1, x2 AND x3
   (used for N==6 where three free rows are fixed at once). */
__device__ int gather_triple_cross( const u64* __restrict__ const & vp,
                                    const int & nv, const u64 & x1,
                                    const u64 & x2, const u64 & x3,
                                    u64* __restrict__ const & vpn ) {
  int j = 0;
#pragma unroll 1
  for( int i = 0; i < nv; i += WS ) {
    int i_t = i + threadIdx.x;
    u64 v = 0ULL;
    if( i_t < nv ) v = vp[ i_t ];
    bool p = __popcll( v & x1 ) == 1 and __popcll( v & x2 ) == 1
             and __popcll( v & x3 ) == 1;
    unsigned ballot = __ballot_sync( AllLanes, p );
    int sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
    if( p ) vpn[ j + sub_sum ] = v;
    j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
  }
  return j;
}

/* Record row vector v at the given level and, warp-cooperatively, fill the
   cell->row lookup: index[NN-1-b] = level for every bit b set in v. Each
   lane handles bit positions threadIdx.x and threadIdx.x + WS. */
__device__ void set_vecr( u64 vecs[ ], int index[ ], const u64 & v,
                          const int & level ) {
  vecs[ level ] = v;
  if( ( v >> threadIdx.x ) & 1 ) {
    index[ NN - 1 - threadIdx.x ] = level;
  }
  if( ( ( v >> WS ) >> threadIdx.x ) & 1 ) {
    index[ NN - 1 - ( WS + threadIdx.x ) ] = level;
  }
  __syncwarp();
}

/* Warp-cooperatively store column vector v into the cell->column lookup at
   every bit position v covers (lane handles bits threadIdx.x, threadIdx.x+WS). */
__device__ void set_vecc( u64 index[ ], const u64 & v ) {
  if( ( v >> threadIdx.x ) & 1 ) {
    index[ threadIdx.x ] = v;
  }
  if( ( ( v >> WS ) >> threadIdx.x ) & 1 ) {
    index[ WS + threadIdx.x ] = v;
  }
  __syncwarp();
}

/* Persistent kernel, launched as <<< nblocks, dim3(WS, NW) >>>:
   each warp (threadIdx.y selects the warp) repeatedly claims a batch of
   row sets from the managed cursor i_rowset_m (atomicAdd_system, so the
   cursor is shared across ALL devices), completes the square by columns and
   diagonals (MakeCols/LastCols), and finally reduces its count across lanes
   and adds it to the managed totals.
   vc_g / vx_g hold the global column / diagonal candidate lists at offset 0,
   followed by a private scratch slab per warp; the last WS words of each
   slab are a zero canary checked before exit. */
__global__ void RowSet( u64* rowset, u64* vc_g, u64* vx_g ) {
  __shared__ u64 row_vecs[ NW ][ N ];
#define ROW_VECS row_vecs[ threadIdx.y ]
#define WHICH_ROW which_row[ threadIdx.y ]
  set_vecr( ROW_VECS, WHICH_ROW, v_max_c, N - 1 );
  set_vecr( ROW_VECS, WHICH_ROW, v_second_c, N - 2 );
  __shared__ u64* vc_s[ NW ];
  __shared__ u64* vx_s[ NW ];
#define VC vc_s[ threadIdx.y ]
#define VX vx_s[ threadIdx.y ]
  VC = vc_g + RUWS( ncols_c ) + vcsize_c * ( blockIdx.x * NW + threadIdx.y );
  VX = vx_g + RUWS( ndiags_c ) + vxsize_c * ( blockIdx.x * NW + threadIdx.y );
  VC[ vcsize_c - WS ] = 0ULL;   /* overrun canaries */
  VX[ vxsize_c - WS ] = 0ULL;
  u64 count = 0ULL;
  u64 sm_count = 0ULL;
  __shared__ int nbatch_s[ NW ];
#define NBATCH nbatch_s[ threadIdx.y ]
  NBATCH = nbatch_c;
  for( ; /* i_rowset < nrowset_c */; /* i_rowset += N - 2 */ ) {
    __shared__ int i_rowbat[ NW ];
#define I_ROWBAT i_rowbat[ threadIdx.y ]
    /* Lane 0 claims the next batch of NBATCH row sets for the whole warp. */
    if( threadIdx.x == 0 ) {
      I_ROWBAT = atomicAdd_system( &i_rowset_m, ( N - 2 ) * NBATCH );
    }
    __syncwarp();
    if( I_ROWBAT >= nrowset_c ) break;
    __shared__ int i_set[ NW ];
#define I_SET i_set[ threadIdx.y ]
    I_SET = 0;
    while( I_SET < NBATCH and I_ROWBAT + I_SET * ( N - 2 ) < nrowset_c ) {
      __shared__ bool single[ NW ];
#define SINGLE single[ threadIdx.y ]
      SINGLE = single_c;
      /* Install the N-2 free rows of this set (levels N-3 down to 0). */
      for( int i = 0; i < N - 2; i ++ ) {
        u64 row_vec = rowset[ I_ROWBAT + ( I_SET * ( N - 2 ) ) + i ];
        set_vecr( ROW_VECS, WHICH_ROW, row_vec, N - 3 - i );
        SINGLE = SINGLE or rev_max_c == row_vec;
      }
      __shared__ int kvc_s[ NW ];
      __shared__ int kvx_s[ NW ];
#define KVC kvc_s[ threadIdx.y ]
#define KVX kvx_s[ threadIdx.y ]
      /* Filter the global column/diagonal candidates against the free rows. */
#if N == 6
      KVC = gather_triple_cross( vc_g, ncols_c, ROW_VECS[ N - 3 ],
                                 ROW_VECS[ N - 4 ], ROW_VECS[ N - 5 ], VC );
      KVX = gather_triple_cross( vx_g, ndiags_c, ROW_VECS[ N - 3 ],
                                 ROW_VECS[ N - 4 ], ROW_VECS[ N - 5 ], VX );
#endif
#if N == 5
      KVC = gather_double_cross( vc_g, ncols_c, ROW_VECS[ N - 3 ],
                                 ROW_VECS[ N - 4 ], VC );
      KVX = gather_double_cross( vx_g, ndiags_c, ROW_VECS[ N - 3 ],
                                 ROW_VECS[ N - 4 ], VX );
#endif
#if N == 4
      KVC = gather_cross( vc_g, ncols_c, ROW_VECS[ N - 3 ], VC );
      KVX = gather_cross( vx_g, ndiags_c, ROW_VECS[ N - 3 ], VX );
#endif
#if N == 3
      for( int i = 0; i < ncols_c; i++ ) VC[ i ] = vc_g[ i ];
      for( int i = 0; i < ndiags_c; i++ ) VX[ i ] = vx_g[ i ];
      KVC = ncols_c;
      KVX = ndiags_c;
#endif
      /* Annotate each diagonal candidate: pack, into bits 40.., the cell
         positions where it meets rows 0,1,2 (8 bits each, via clz).
         FinalCheck() decodes these fields later. */
      for( int iw = 0; iw < KVX; iw += WS ) {
        if( iw + threadIdx.x < KVX ) {
          u64 v = VX[ iw + threadIdx.x ];
          VX[ iw + threadIdx.x ] |=
            u64 ( ( 63 - __clzll ( v & ROW_VECS[ 0 ] ) )
                | ( ( 63 - __clzll ( v & ROW_VECS[ 1 ] ) ) << 8 )
                | ( ( 63 - __clzll ( v & ROW_VECS[ 2 ] ) ) << 16 ) ) << 40;
        }
      }
      __syncwarp();
#if N > 3
      int sm_count_cols = 0;
      count += MakeCols( VC, KVC, VX, KVX, SINGLE, sm_count_cols ) ;
      assert( ( sm_count_cols >> CINTLIM ) == 0 ); /* int counter sanity */
      sm_count += sm_count_cols;
#else
      u64 sub_count = 0;
      int sm_count_cols = 0;
      int nvn = cnext( VC, KVC, NN - 1 );
      LastCols( AllBits, VC, nvn, KVC - nvn, VX, KVX, SINGLE,
                sub_count, sm_count_cols );
      count += sub_count;
      sm_count += sm_count_cols;
#endif
#if N > 5
#ifdef V
      if( blockIdx.x == 0 and threadIdx.x == 0 and threadIdx.y == 0 ) {
        printf( "r3:%09llx %9llu %9llu %5d\n",
                ROW_VECS[ N - 3 ], count, sm_count, NBATCH );
      }
#endif
#endif
      if( threadIdx.x == 0) I_SET++;
      __syncwarp();
    }
    /* Shrink the batch as the remaining work dwindles (load balancing). */
    if( threadIdx.x == 0 )
      NBATCH = max( (nrowset_c - i_rowset_m)/(N - 2)/totalwarps_c/3, 1 );
    __syncwarp();
  }
  /* Canary check: non-zero means some gather overran its scratch slab. */
  if ( VC[ vcsize_c - WS ] != 0ULL or VX[ vxsize_c - WS ] != 0ULL ) {
    printf( " Array over run ! \n");
    assert( 0 ); // exit( 1 );
  }
  /* Butterfly reduction of per-lane counts, then one system-wide atomic
     per warp from lane 0. (sm_count was already warp-uniform.) */
  for( int span = ( WS >> 1 ); span > 0; span >>= 1 ) {
    count += __shfl_xor_sync( AllLanes, count, span );
  }
  if( threadIdx.x == 0 ) {
    atomicAdd_system( &count_m, count );
    atomicAdd_system( &sm_count_m, sm_count );
  }
}

#if N > 3
/* Choose the first N-3 columns (levels N-4 down to 1), delegating the last
   three to LastCols(). The recursion is flattened into goto-driven loops
   over explicit per-warp stacks in shared memory (REM_S/VCB_S/... indexed
   by lvl); Init pushes a level, ReInit pops back. Candidate lists shrink
   level by level into successive slabs of the warp's scratch area.
   Returns the magic-square subtotal; sm_count accumulates semi-magic hits. */
__device__ u64 MakeCols( u64* __restrict__ const & vcb, const int & nvc,
                         u64* __restrict__ const & vxb, const int & nvx,
                         const bool & single, int & sm_count) {
  __shared__ u64 rem_s[ NW ][ N - 3 ];
  __shared__ u64* vcb_s[ NW ][ N - 3 ];
  __shared__ int nvc_s[ NW ][ N - 3 ];
  __shared__ int nvn_s[ NW ][ N - 3 ];
  __shared__ u64* vxb_s[ NW ][ N - 3 ];
  __shared__ int nvx_s[ NW ][ N - 3 ];
  __shared__ bool single_s[ NW ][ N - 3 ];
#define REM_S rem_s[ threadIdx.y ]
#define VCB_S vcb_s[ threadIdx.y ]
#define NVC_S nvc_s[ threadIdx.y ]
#define NVN_S nvn_s[ threadIdx.y ]
#define VXB_S vxb_s[ threadIdx.y ]
#define NVX_S nvx_s[ threadIdx.y ]
#define SINGLE_S single_s[ threadIdx.y ]
  int lvl = N - 4;
  REM_S[ lvl ] = AllBits;//rem;
  VCB_S[ lvl ] = vcb;
  NVC_S[ lvl ] = nvc;
  NVN_S[ lvl ] = cnext( vcb, nvc, NN - 1 ); /* columns holding cell NN */
  VXB_S[ lvl ] = vxb;
  NVX_S[ lvl ] = nvx;
  SINGLE_S[ lvl ] = single;
  __shared__ int ivc_s[ NW ][ N - 3 ];
#define IVC_S ivc_s[ threadIdx.y ]
  u64 sub_count = 0;
  // int sm_count = 0;
Init:
  IVC_S[ lvl ] = 0;
  // for( ; ivc_s[ lvl ] < nvn_s[ lvl ]; ivc_s[ lvl ] ++ ) {
Loop:
  if( IVC_S[ lvl ] >= NVN_S[ lvl ] ) { // Recursive return
    if( lvl == N - 4 ) return sub_count;
    else { lvl ++; goto ReInit; }
  }
  { // Scope limit for locals.
    __shared__ u64 col_vec[ NW ];
#define COL_VEC col_vec[ threadIdx.y ]
    COL_VEC = VCB_S[ lvl ][ IVC_S[ lvl ] ];
    __shared__ u64* vxbn[ NW ];
#define VXBN vxbn[ threadIdx.y ]
    VXBN = RUWS( NVX_S[ lvl ] ) + VXB_S[ lvl ]; /* next diag slab */
    __shared__ int kvx[ NW ];
#undef KVX
#define KVX kvx[ threadIdx.y ]
    KVX = gather_cross( VXB_S[ lvl ], NVX_S[ lvl ], COL_VEC, VXBN );
    __shared__ u64* vcbn[ NW ];
#define VCBN vcbn[ threadIdx.y ]
    VCBN = RUWS( NVC_S[ lvl ] ) + VCB_S[ lvl ]; /* next column slab */
    __shared__ int kvc[ NW ];
#undef KVC
#define KVC kvc[ threadIdx.y ]
    __shared__ int kvn[ NW ];
#define KVN kvn[ threadIdx.y ]
    KVC = gather_para( VCB_S[ lvl ], NVN_S[ lvl ], NVC_S[ lvl ], COL_VEC,
                       VCBN, topbit( REM_S[ lvl ] ^ COL_VEC ), KVN );
    set_vecc( which_colv[threadIdx.y], COL_VEC );
    if( lvl > 0 ) { // Recursive call
      REM_S[ lvl - 1 ] = REM_S[ lvl ] ^ COL_VEC;
      VCB_S[ lvl - 1 ] = VCBN;
      NVC_S[ lvl - 1 ] = KVC;
      NVN_S[ lvl - 1 ] = KVN;
      VXB_S[ lvl - 1 ] = VXBN;
      NVX_S[ lvl - 1 ] = KVX;
      SINGLE_S[ lvl - 1 ] = SINGLE_S[ lvl ] or ( rev_max_c == COL_VEC );
      lvl --;
      goto Init;
    } else { // 31 / 42
      LastCols( unsigned( REM_S[ lvl ] ^ COL_VEC ), VCBN, KVN, KVC - KVN,
                VXBN, KVX,
                SINGLE_S[ lvl ] or ( rev_max_c == COL_VEC ),
                sub_count, sm_count );
    }
  } // Scope limit for locals.
ReInit:
  IVC_S[ lvl ] ++;
  goto Loop;
}
#endif // N > 3

/* Choose the final three columns. vcb holds candidates split as:
   [0, nvn) contain the highest remaining cell ("v2" choices), and
   [nvn, nvn+nvcmn) the rest ("v1" choices); the last column v0 is then
   forced as rem ^ v2 ^ v1 and only validated. All remaining cell masks fit
   in 32 bits here (unsigned arithmetic throughout). Lanes evaluate one
   (v2,v1) pair each; surviving pairs are then replayed warp-uniformly by
   walking the reversed ballot mask (v1e). For each valid completion the
   diagonal candidates are filtered (gather_double_cross) and counted by
   MakeDiagC/MakeDiagS; `shift` doubles the contribution of non-self-symmetric
   ("double") squares. sub_count accumulates magic squares, sm_count
   semi-magic completions. */
__device__ void LastCols( unsigned rem,
                          const u64* __restrict__ const & vcb,
                          const int & nvn, const int nvcmn,
                          u64* __restrict__ const & vxb, const int & nvx,
                          const bool single, u64 & sub_count, int & sm_count ) {
  unsigned thread_bit = 1U << threadIdx.x;
  u64* thread_ep = & which_colv[ threadIdx.y ] [ threadIdx.x ];
  __shared__ int ic21_s[ NW ];
#define IC21W ic21_s[ threadIdx.y ]
  int nc21 = nvn * ( nvcmn - 1 );   /* number of (v2,v1) pairs */
  for( IC21W = 0; IC21W < nc21; IC21W += WS ) {
    bool v1ep = false;
    bool psingl = single;
    /* Phase 1: each lane tests one pair. */
    if( IC21W + threadIdx.x < nc21 ) {
      u64 v2 = vcb[ ( IC21W + threadIdx.x ) / ( nvcmn - 1 ) ];
      unsigned v1 = unsigned( vcb[ nvn + ( IC21W + threadIdx.x ) % ( nvcmn - 1 ) ] );
      unsigned v0 = rem ^ unsigned( v2 ) ^ v1;  /* forced last column */
      v1ep = ( v1 >> topbit( rem ^ unsigned( v2 ) ) )  /* v1 holds top remaining cell */
             and ( ( v1 & unsigned(v2) ) == 0 )        /* disjoint from v2 */
             and ( v_max32_c >= __brev(v0) );          /* canonicity of v0 */
      psingl = psingl or rev_max32_c == v2 or rev_max32_c == v1
               or rev_max32_c == v0 ;
    }
    unsigned v1e = __brev( __ballot_sync( AllLanes, v1ep ) ); /* lane 0 first */
    unsigned dbl = __ballot_sync( AllLanes, ! psingl );
    int ic21 = 0;
    /* Phase 2: all lanes replay each surviving pair together. */
    while( v1e ) {
      int skip = __clz( v1e );
      ic21 += skip;
      u64 v2 = vcb[ ( IC21W + ic21 ) / ( nvcmn - 1 ) ];
      unsigned v1 = unsigned( vcb[ nvn + ( IC21W + ic21 ) % ( nvcmn - 1 ) ] );
      dbl >>= ( skip );
      v1e <<= ( skip + 1 );
      ic21 ++;
      /* Publish v2, v1, v0 into the cell->column table for FinalCheck():
         each lane owns the table slots of its two bit positions. */
      if( unsigned( v2 ) & thread_bit ) *thread_ep = v2;
      if( unsigned( v2 >> WS ) & thread_bit ) *( thread_ep + WS ) = v2;
      if( ( rem ^ unsigned( v2 ) ) & thread_bit ) {
        *thread_ep = u64( v1 ^ ( ( v1 & thread_bit ) ? 0
                                 : ( rem ^ unsigned( v2 ) ) ) );
        // __syncwarp(); // synced in the gather_double_cross below.
      }
      u64* vxbn = RUWS( nvx ) + vxb;
      int kvx = gather_double_cross( vxb, nvx, v2, v1, vxbn );// 8/ 42
      int shift = ( dbl & 1 );   /* 1 => count this completion twice */
      dbl >>= 1;
      // avr( kvx ) : 8.6, 27% : > 10, 0.5% : > 20, 0.0005% : > 30
      /* The pair table only covers up to MDIAG candidates; peel extras by
         pairing the first candidate against the rest serially. */
      while( kvx > MDIAG ) { // 14 / 42
        sub_count += MakeDiagS( vxbn, kvx ) << shift;
        vxbn ++;
        kvx --;
      }
      sub_count += MakeDiagC( vxbn, kvx ) << shift;
      sm_count += ( 1 << shift );
      __syncwarp(); // !!
      // necessary to prevent shared data from being updated too early.
    }
  }
  return;
}

/* Count valid diagonal pairs (vxb[0], vxb[ic]) for ic in [1, nvx):
   vxb[0] fixed as one diagonal (low 40 bits = cell mask), each lane tests
   one partner. Odd N: the two diagonals share exactly the center cell;
   even N: they must be disjoint. Survivors go through FinalCheck(). */
__device__ int MakeDiagS( const u64* __restrict__ const & vxb,
                          const int & nvx ) {
  int diag_count = 0;
  u64 vdx = ( *vxb ) & ( ( 1ULL << 40 ) - 1 );
#pragma unroll 1
  for( int ic = 1; ic < nvx; ic += WS ) {
    if( ic + threadIdx.x < nvx ) {
      u64 vcx = vxb[ ic + threadIdx.x ];
#if N%2
      if( __popcll( vdx & vcx ) == 1 )
#else
      if( ( vdx & vcx ) == 0 )
#endif
      { // 4 - 10 / 42
        if( FinalCheck( *vxb, vcx ) ) diag_count ++;
      }
    }
  }
  return diag_count;
}

/* Count valid diagonal pairs among vx[0..vxn) using the precomputed
   all-pairs index table p_idx_c (offsets per candidate count in p_offset_c;
   each entry packs (i<<16)|j). Same center-cell / disjointness rule and
   FinalCheck() as MakeDiagS, but one lane per pair. */
__device__ int MakeDiagC( const u64* __restrict__ const & vx,
                          const int & vxn ) {
  int diag_count = 0;
#pragma unroll 1
  for( int ivx = p_offset_c[ vxn ]; ivx < p_offset_c[ vxn + 1 ]; ivx += WS ) {
    if( ivx + threadIdx.x < p_offset_c[ vxn + 1 ] ) {
      int idx = p_idx_c[ ivx + threadIdx.x ];
      u64 vdx = vx[ idx >> 16 ];
      u64 vcx = vx[ idx & 0xffff ];
#if N%2
      if( __popcll( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 1 )
#else
      if( ( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 0 )
#endif
      { // 4-10 / 42
        if( FinalCheck( vdx, vcx ) ) diag_count ++;
      }
    }
  }
  return diag_count;
}

/* Verify that the two diagonal candidates actually lie on crossing
   geometric diagonals of the completed square, using the row/column lookup
   tables (which_row/which_colv) and the row-hit positions packed into bits
   40.. of each candidate by RowSet(). Returns true iff the pair is a valid
   diagonal pair. */
__device__ bool FinalCheck( const u64 & vdx, const u64 & vcx ) {
  int* e2row = which_row[ threadIdx.y ] + NN - 64;
  u64* e2colv = which_colv[ threadIdx.y ];
  int i;
  i = e2row[ __clzll( vdx & e2colv[ ( vcx >> 40 ) & 0xff ] ) ];
  if( e2row[ __clzll( vcx & e2colv[ ( vdx >> 40 ) & 0xff ] ) ] != i )
    return false;
  i = 40 + ( ( ( i & 1 ) + 1 ) << 3 );  /* pick the 8-bit field to test next */
  return ( e2row[ __clzll( vdx & e2colv[ ( vcx >> i ) & 0xff ] ) ]
           == e2row[ __clzll( vcx & e2colv[ ( vdx >> i ) & 0xff ] ) ] ) ;
}

#ifndef noMD5
/* Format "<row1 hex> <row2 hex> <count>" exactly as printed and write its
   MD5 digest as a 32-char hex string (plus NUL) into md5str. */
void MakeMD5( u64 max_p, u64 second_p, u64
              count, char md5str[] ) {
  MD5_CTX md5ctx;
  unsigned char md5bin[ MD5_DIGEST_LENGTH ];
  /* Template fixes the message length (hex widths are format-padded). */
  const char* sample = "e00000007 1c0000038 494199069345";
  int msg_len = strlen( sample );
  char msg[ msg_len + 1 ];
  sprintf( msg, "%09llx %09llx %15llu", max_p, second_p, count );
  MD5_Init( &md5ctx );
  MD5_Update( &md5ctx, msg, msg_len );
  MD5_Final( md5bin, &md5ctx );
  for( int i = 0; i < MD5_DIGEST_LENGTH; i ++ )
    sprintf( md5str + ( i * 2 ), "%02x", md5bin[ i ] );
  md5str[ MD5_DIGEST_LENGTH * 2 ] = 0;
}
#endif

/* Entry point. Expects 4 arguments (see file header); argv[2] and argv[4]
   are the hex patterns of the first and second row used to restrict the
   search range (0 = unrestricted). NOTE(review): argc is not checked, and
   argv[1]/argv[3] (the row numbers) are parsed only implicitly by position —
   running with fewer than 4 arguments dereferences a null argv slot.
   For every canonical (row1, row2) pair in range, runs Count() across all
   CUDA devices and prints both the magic and semi-magic ("S") totals with
   optional MD5 checksums. */
int main( int argc, char* argv[] ) {
  fprintf( stderr, "%s\n", __FILE__ );
  fprintf( stderr, "Size of long long unsigned : %lu \n", sizeof( u64 ) );
  if( sizeof( u64 ) * 8 < NN ) {
    fprintf( stderr, "Size of long long is too small : %lu \n",
             sizeof( u64 ) );
    exit( 1 );
  }
  if( check_topbit( ) ) {
    fprintf( stderr, "topbit() does't work correctly on this platform.\n" );
    exit( 1 );
  }
  /* All N-subsets of 1..NN with the magic sum N(NN+1)/2, as bit masks. */
  GenVecs( NN, ( ( NN + 1 ) * N ) / 2, N - 1, 0ULL );
  fprintf( stderr, "nvecs %d\n", nvecs );
  /* pair indexes -----------------------------------
     p_idx[p_offset[n] .. p_offset[n+1]) enumerates all ordered pairs
     (idx<<16)|icx with 0 <= idx < icx < n, for MakeDiagC(). */
  int p_offset[ MDIAG + 2 ];
  int p_idx[ MDPAIR ];
  int ioff = 0;
  for( int ndiag = 0; ndiag < MDIAG + 1; ndiag ++ ) {
    p_offset[ ndiag ] = ioff;
    for( int idx = 0; idx < ndiag - 1; idx ++ ) {
      for( int icx = idx + 1; icx < ndiag; icx ++ ) {
        p_idx[ ioff ++ ] = ( idx << 16 ) | icx;
      }
    }
  }
  p_offset[ MDIAG + 1 ] = ioff;
  fprintf( stderr, "size of p_idx_c %d \n", ioff );
  if( ioff >= MDPAIR ) {
    fprintf(stderr, "p_idx_c index out of ragne. %d >= %d\n", ioff, MDPAIR );
    exit( 1 );
  }
  /* ------------------------------------------------ */
  // Get device properties
  // Set a part of constant memory
  //
  int ndev;
  CU_SAFE( cudaGetDeviceCount( &ndev ) );
  vector<int> dev_blocks( ndev );
  totalwarps = 0;
  for( int idev = 0 ; idev < ndev; idev ++ ) {
    CU_SAFE( cudaSetDevice( idev ) );
    cudaDeviceProp prop;
    CU_SAFE( cudaGetDeviceProperties( &prop, idev ) );
    int nblocks = prop.multiProcessorCount * NB_SM;
    dev_blocks[ idev ] = nblocks;
    fprintf( stderr, "DevId:%2d #SM:%d, #Blocks:%d %s\n",
             idev, prop.multiProcessorCount, nblocks, prop.name);
    /* The pair table never changes; upload it to each device once. */
    CU_SAFE( cudaMemcpyToSymbol( p_offset_c, p_offset,
                                 sizeof( int ) * ( MDIAG + 2 ) ) );
    CU_SAFE( cudaMemcpyToSymbol( p_idx_c, p_idx, sizeof( int ) * ioff ) );
    totalwarps += nblocks * NW;
  }
  fprintf( stderr, "Total warps: %5d\n", totalwarps );
  //
  u64 row1P = strtoll( argv[ 2 ], NULL, 16 );
  u64 row2P = strtoll( argv[ 4 ], NULL, 16 );
  fprintf( stderr, "r1:%09llx r2:%09llx\n", row1P, row2P );
  u64 total_count = 0ULL;
  u64 total_sm_count = 0ULL;
  for( int max_row = 0; max_row < nvecs; ++ max_row ) {
    u64 max_rowP = vecs[ max_row ];
    if( row1P != 0ULL and row1P < max_rowP ) continue; /* skip until match */
    if( max_rowP < row1P ) break;                      /* past requested row */
    if( ! ( max_rowP & 1ULL << ( NN - 1 ) ) ) break;   /* must hold cell NN */
    if( max_rowP < Reverse( max_rowP ) ) continue;     /* canonical form only */
    for( int second_row = max_row + 1; second_row < nvecs; ++ second_row ) {
      u64 second_rowP = vecs[ second_row ];
      if( row2P != 0ULL and row2P < second_rowP ) continue;
      if( second_rowP < row2P ) break;
      if( max_rowP & second_rowP ) continue;           /* rows must be disjoint */
      if( max_rowP < Reverse( second_rowP ) ) continue;
      u64 rem = AllBits ^ max_rowP ^ second_rowP;
      if( rem > second_rowP ) break; /* remaining rows can't stay below row 2 */
      count_m = 0ULL;
      sm_count_m = 0ULL;
      Count( max_row, second_row, dev_blocks );
      printf( "%05d %09llx ", max_row, vecs[ max_row ] );
      printf( "%05d %09llx ", second_row, vecs[ second_row ] );
      printf( "%15llu ", count_m );
#ifndef noMD5
      char md5str[ MD5_DIGEST_LENGTH * 2 + 1 ];
      MakeMD5( vecs[ max_row ], vecs[ second_row ], count_m, md5str );
      printf( "%s", md5str );
#endif
      printf( "\n");
      printf( "%05d %09llx ", max_row, vecs[ max_row ] );
      printf( "%05d %09llx ", second_row, vecs[ second_row ] );
      printf( "S %13llu ", sm_count_m );
#ifndef noMD5
      MakeMD5( vecs[ max_row ], vecs[ second_row ], sm_count_m, md5str );
      printf( "%s", md5str );
#endif
      printf( "\n");
      total_count += count_m;
      total_sm_count += sm_count_m;
    }
  }
  /* Grand totals only make sense for an unrestricted second-row scan. */
  if( row2P == 0ULL ) {
    fprintf( stderr, "Total: %17llu\n", total_count );
    fprintf( stderr, "Total_SM:%15llu\n", total_sm_count );
  }
  return 0;
}
ms-20240410.cu.txt · Last modified: 2024/04/12 23:24 by 127.0.0.1