The number of magic squares of order 6

An execution example on GeForce RTX-4090

$ nvcc -O3 -arch=sm_60 -maxrregcount=40 ms-20240410.cu -lcrypto
$ time ./a.out 00000 e00000007 13745 180800250
ms-20240410.cu
Size of long long unsigned : 8 
topbit() works correctly.
nvecs 32134
size of p_idx_c 2300 
DevId: 0  #SM:128, #Blocks:2048 NVIDIA GeForce RTX 4090
Total warps:  6144
r1:e00000007 r2:180800250
nrows 3116  n3rd 711  
row_set.size() 5956644,   242 sets/warp 
00000 e00000007 13745 180800250    247438137089 824ce60dca0861defc383ee53d3367e7
00000 e00000007 13745 180800250 S  286856560469 65e40aa25a4edf87381bb51748779337

real    1m56.533s
user    1m56.274s
sys     0m0.256s

ms-20240410.cu

/*
ms.cu
 
Counting number of magic squares up to rotations, 
reflections and M tranformations.
 
Simultaneously counting semi-magic squares.
 
Compilation :
 
  nvcc -O3 -arch=sm_60 -maxrregcount=40 \
       -Wno-deprecated-declarations ms.cu -lcrypto 
 
  To omit md5 checksum, add a -DnoMD5 option. Then you may drop 
  -Wnodeprecated-declarations and -lcypto.
 
  -DV option produces verbose outputs for N=6. 
 
Command line options:
 
  % ./a.out  <1stRowNo> <1stRowPattern> <2ndRowNo> <2ndRowPattern>
  example : ./a.out 00000 e00000007 13745 180800250
 
See also : https://magicsquare6.net/
 
Updated on 2024/04/10.
Updated on 2023/11/28: Critical __syncwarp() added in LastCols().
 
Authored by Hidetoshi Mino hidetoshimino@gmail.com .
 
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <assert.h>
#ifndef noMD5
  #include <openssl/md5.h>
#endif
#include <vector>
using namespace std;
 
#ifndef N          // order of square : 3..6
  #define N 6
#endif
 
#ifndef NB_SM      // Number of blocks per SM
  #define NB_SM 16
#endif
 
#ifndef NW         // Number of Warps per block
  #define NW 3
#endif
 
#ifndef NSMPL
  #define NSMPL 1
#endif
 
#ifndef MDIAG
  #define MDIAG 24
#endif
 
#ifndef CINTLIM
  #define CINTLIM 26
#endif
 
typedef long long unsigned u64;
 
#define NN (N*N)
#define AllBits ( ( 1ULL << NN ) - 1 )
#define AllLanes (0xffffffff)
 
#define MAX( x, y ) ( (x) > (y) ? (x) : (y) )
#define CU_SAFE(stmt) \
  do { \
    cudaError_t status = stmt; \
    if (status != cudaSuccess) { \
       printf("ERROR: %s in %s:%d.\n", cudaGetErrorString(status), \
       __FILE__, __LINE__); \
       exit( 1 );  \
    } \
  } while (0)
 
#define WS 32   /* Warp size */
#define RUWS( x ) ( ( ( x ) + ( WS - 1 ) ) & ~( WS - 1 ) ) /* Round up to WS */
#define RDWS( x ) ( ( x ) & ~( WS - 1 ) )            /* Round down to WS */
 
__device__ __host__ int topbit( u64 v ) {
 
#ifdef __CUDA_ARCH__ 
   return 63 - __clzll( v );
#else
   v = v & ( ~( v >> 1 ) );
   float f32 = (float) v;
   int pos = *( (int *) &f32 );
   pos =( pos >> 23 ) - 0x7f;
   return pos;
#endif
/*
  int pos = 0;
  int v32 = v >> 32;
  if( v32 ) {
      pos = 32;
  } else {
     v32 = (int) v;
  }
  if( v32 & 0xffff0000UL ) { pos +=16;  v32 >>= 16; }
  if( v32 & 0x0000ff00UL ) { pos += 8;  v32 >>=  8; }
  if( v32 & 0x000000f0UL ) { pos += 4;  v32 >>=  4; }
  if( v32 & 0x0000000cUL ) { pos += 2;  v32 >>=  2; }
  if( v32 & 0000000002UL ) pos++;
  return pos;
*/
}
 
__device__ __host__ u64 reverse64( u64 x ) {
 
#ifdef __CUDA_ARCH__
 
   return __brevll( x );
 
#else
   u64 y;
   y = ( ( x & 0x5555555555555555ULL ) << 1 );
   x = ( y | ( ( x >> 1 ) & 0x5555555555555555ULL ) );
   y = ( ( x & 0x3333333333333333ULL ) << 2 );
   x = ( y | ( ( x >> 2 ) & 0x3333333333333333ULL ) );
   y = ( ( x & 0x0F0F0F0F0F0F0F0FULL ) << 4 );
   x = ( y | ( ( x >> 4 ) & 0x0F0F0F0F0F0F0F0FULL ) );
   y = ( ( x & 0x00FF00FF00FF00FFULL ) << 8 );
   x = ( y | ( ( x >> 8 ) & 0x00FF00FF00FF00FFULL ) );
   y = ( ( x & 0x0000FFFF0000FFFFULL ) << 16 );
   x = ( y | ( ( x >> 16 ) & 0x0000FFFF0000FFFFULL ) );
   y = ( ( x & 0x00000000FFFFFFFFULL ) << 32 );
   x = ( y | ( ( x >> 32 ) & 0x00000000FFFFFFFFULL ) );
   return x;
#endif
}
 
int check_topbit( ) {
 
   for( int i = 0; i < NN; i ++ ) {
      // printf( "%d %09llx %d\n", i, 1ULL << i, topbit( 1ULL << i ) );
      if( topbit( 1ULL << i ) != i ) return 1;
   }
   for( int i = 1; i < NN; i ++ ) {
      // printf( "%d %09llx %d\n"
      //         , i, ( 1ULL << i ) - 1, topbit( ( 1ULL << i ) - 1 ) );
      if( topbit( ( 1ULL << i ) - 1 ) != i - 1 ) return 1;
   }
   fprintf( stderr, "topbit() works correctly.\n" );
   return 0;
}
 
__device__ __host__ int vcsize_f( int ncols, int order ) 
{
  return RUWS( ncols )  * ( order - 2 ) + WS; 
}
 
__device__ __host__ int vxsize_f( int ndiags, int order ) 
{
  return RUWS( ndiags ) * ( order - 1 ) + WS;
}
 
/* = Host globals ======================= */
 
#define MVECS 32768
int nvecs = 0;
u64 vecs[ MVECS ];
int totalwarps;
 
/* = Constant memory ==================== */
 
__constant__ u64 v_max_c;
__constant__ unsigned v_max32_c;
__constant__ u64 rev_max_c;
__constant__ unsigned rev_max32_c;
__constant__ u64 v_second_c;
__constant__ bool single_c;
__constant__ int nrowset_c;
__constant__ int ncols_c, ndiags_c;
__constant__ int vcsize_c, vxsize_c;
__constant__ int nbatch_c;
__constant__ int totalwarps_c;
 
#define MDPAIR ( MDIAG * MDIAG * MDIAG / 5 + ( WS - 1 ) * MDIAG )
 
__constant__ int p_offset_c[ MDIAG + 2 ];
__constant__ int p_idx_c[ MDPAIR ];
 
/* = Block __shared__ ==================== */
 
__shared__ int which_row[ NW ][ NN ]; 
__shared__ u64 which_colv[ NW ][ NN ]; 
 
/* = Managed ( Unified ) =================== */
 
__managed__ int i_rowset_m = 0;
 
__managed__ u64 count_m = 0ULL;
__managed__ u64 sm_count_m = 0ULL;
 
/* ========================================= */
 
void Count( int max_row, int second_row, const vector<int> & dev_blocks );
void MakeRows( int m, int l, u64 rem, u64* vrb, u64* vre, vector<u64> & res );
 
__global__  void RowSet( u64* row_set_d, u64* vc_g, u64* vx_g );
__device__ u64 MakeCols( u64* __restrict__ const & vcb, const int & nvc, 
                         u64* __restrict__ const & vxb, const int & nvx, 
                         const bool & single, int & sm_count );
__device__ void LastCols( unsigned rem, 
       const u64* __restrict__ const & vcb, const int & nvn, const int nvcmn,
       u64* __restrict__ const & vxb, const int & nvx, const bool single, 
       u64 & sub_count, int & sm_count );
__device__ int MakeDiagC( 
       const u64* __restrict__ const & vxb, const int & nvx );
__device__ int MakeDiagS( 
       const u64* __restrict__ const & vxb, const int & nvx );
__device__ bool FinalCheck( const u64 & vdx, const u64 & vcx );
 
int Cross( int v1, int v2 ) {
   u64 v = vecs[v1] & vecs[v2];
   return v && ( ! ( v & ( v - 1 ) ) );
}
 
__device__ __host__ u64 Reverse( u64 v ) 
{ 
  return reverse64( v ) >> (64 - NN); 
}
 
void GenVecs( int max_ent, int rem_sum, int level, u64 vp ) 
{
  if ( level >= 0 ) {
    for ( int ent = max_ent; ent > 0 ; --ent ) {
      u64 v = 1ULL << ( ent - 1 );
      if ( rem_sum >= ent ) 
        GenVecs( ent - 1, rem_sum - ent, level - 1, vp | v );
    }
  }
  else {
    if ( rem_sum == 0 )  {
       vecs[ nvecs ++ ] = vp;
    }
  }
  if( nvecs > MVECS ) {
    fprintf( stderr, "nvecs > MVECS\n" );
    exit(1);
  }
}
 
void Count( int max_row, int second_row, const vector<int> & dev_blocks ) 
{
  u64 v_max = vecs[ max_row ];
  unsigned v_max32 = ( NN > 32 ) ? v_max >> ( NN - 32 ) : v_max << ( 32 - NN );
  u64 v_second = vecs[ second_row ];
  u64 rem = AllBits ^ v_max ^ v_second;
  u64 must_bit = ( 1ULL <<  topbit( rem ) );
 
  vector<u64> rows;  rows.reserve( nvecs );
  vector<u64> cols;  cols.reserve( nvecs );
  vector<u64> diags; diags.reserve( nvecs );
 
  int n3rd = 0;
  for( int i = second_row + 1; i < nvecs; ++ i ) 
    if( ( ! ( v_max    & vecs[ i ] ) ) &&  ( ! ( v_second & vecs[ i ] ) )
        && v_max >= Reverse( vecs[ i ] ) )  
    {
       rows.push_back( vecs[ i ] );
       if( vecs[ i ] & must_bit ) n3rd ++;
    }
  int nrows = rows.size();
 
  vector<u64> row_set; row_set.reserve( n3rd * 3000 * ( N - 2 ) );
  u64 vr[ nrows * ( N - 2 ) ]; 
  for( int i = 0; i < nrows; i ++ ) vr[ i ] = rows[ i ];
  fprintf( stderr, "nrows %d  n3rd %d  \n", nrows, n3rd );
 
  MakeRows( N - 2, 0, rem, vr, vr + nrows, row_set );
  fprintf( stderr, "row_set.size() %7lu, %5lu sets/warp \n", 
                    row_set.size(), row_set.size() / ( N - 2 ) / totalwarps );
  int nbatch_ini = max( row_set.size() / ( N - 2 ) / totalwarps / 3, 1ul );
 
  for( int i = max_row + 1; i < nvecs; ++ i ) 
    if( Cross( max_row, i ) && Cross( second_row, i ) 
        && v_max >= Reverse( vecs[ i ] ) )
    {
       cols.push_back( vecs[ i ] );
    }
  int ncols = cols.size();
 
  for( int i = 0; i < nvecs; ++ i ) 
    if( Cross( max_row, i ) && Cross( second_row, i ) ) 
       diags.push_back( vecs[ i ] );
  int ndiags = diags.size();
 
  /* constant parameters for the __global__ RowSet()  */
 
  u64 rev_max = Reverse( v_max );
  unsigned rev_max32 = unsigned( rev_max );
  bool single = rev_max == v_max or rev_max == v_second;
  int nrowset = row_set.size();
  int vcsize = vcsize_f( ncols, N );
  int vxsize = vxsize_f( ndiags, N );
 
  // Device Memory Allocation -------------------
 
  int ndev = dev_blocks.size();
 
  u64* row_set_d[ ndev ]; 
  u64* vc_d[ ndev ]; 
  u64* vx_d[ ndev ];
 
  i_rowset_m = 0;  // initialize managed variable
  // count_m = 0ULL; // Don't initialize here.
 
  for( int idev = 0 ; idev < ndev; idev ++ ) {
 
    CU_SAFE( cudaSetDevice( idev ) );
 
    int nblocks = dev_blocks[ idev ];
 
    CU_SAFE( cudaMalloc( row_set_d + idev, sizeof( u64 ) * row_set.size() ) );
    CU_SAFE( cudaMemcpy( row_set_d[ idev ], row_set.data(), 
                sizeof( u64 ) * row_set.size(), cudaMemcpyHostToDevice ) );
 
    CU_SAFE( cudaMalloc( vc_d + idev, 
                sizeof(u64) * ( RUWS( ncols )  + vcsize * nblocks * NW ) ) );
    CU_SAFE( cudaMalloc( vx_d + idev, 
                sizeof(u64) * ( RUWS( ndiags ) + vxsize * nblocks * NW ) ) );
    CU_SAFE( cudaMemcpy( vc_d[ idev ], cols.data(), ncols*sizeof(u64), 
                cudaMemcpyHostToDevice ) );
    CU_SAFE( cudaMemcpy( vx_d[ idev ], diags.data(), ndiags*sizeof(u64), 
                cudaMemcpyHostToDevice ) );
 
    // constant memory setup --------------------
 
    CU_SAFE( cudaMemcpyToSymbol( v_max_c, &v_max, sizeof( u64 ) ) );
    CU_SAFE( cudaMemcpyToSymbol( v_max32_c, &v_max32, sizeof(unsigned) ) );
    CU_SAFE( cudaMemcpyToSymbol( v_second_c, &v_second, sizeof( u64 ) ) );
    CU_SAFE( cudaMemcpyToSymbol( rev_max_c, &rev_max, sizeof( u64 ) ) );
    CU_SAFE( cudaMemcpyToSymbol( rev_max32_c, &rev_max32, sizeof(unsigned) ) );
 
    CU_SAFE( cudaMemcpyToSymbol( single_c, &single, sizeof( bool ) ) );
    CU_SAFE( cudaMemcpyToSymbol( nrowset_c, &nrowset, sizeof( int ) ) );
 
    CU_SAFE( cudaMemcpyToSymbol( ncols_c, &ncols, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( ndiags_c, &ndiags, sizeof( int ) ) );
 
    CU_SAFE( cudaMemcpyToSymbol( vcsize_c, &vcsize, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( vxsize_c, &vxsize, sizeof( int ) ) );
 
    CU_SAFE( cudaMemcpyToSymbol( nbatch_c, &nbatch_ini, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( totalwarps_c, &totalwarps, sizeof( int ) ) );
 
    // ------------------------------------------
 
    RowSet <<< nblocks, dim3( WS, NW ) >>> ( 
                  row_set_d[ idev ], vc_d[ idev ], vx_d[ idev ] );
  }
 
  for( int idev = 0 ; idev < ndev; idev ++ ) {
 
    CU_SAFE( cudaSetDevice( idev ) );
    // cudaDeviceSynchronize();
 
    // Device Memory Deallocation ----------------
 
    CU_SAFE( cudaFree( vx_d[ idev ] ) ); 
    CU_SAFE( cudaFree( vc_d[ idev ] ) );  
    CU_SAFE( cudaFree( row_set_d[ idev ] ) );
  }
 
  return; 
}
 
void MakeRows( int m, int l, u64 rem, u64* vrb, u64* vre, vector<u64> & res )
{
  static u64 row_vecs[ N ];
 
  u64 must_bit = ( 1ULL <<  topbit( rem ) );
 
  for(   ; vrb <  vre; vrb += NSMPL ) {
 
     u64 row_vec = *vrb;
     if( ! ( row_vec & must_bit ) ) break;
 
     row_vecs[ l ] = row_vec;
 
     if( m > l + 1 ) {
        u64* vrp = vrb + 1;
        u64* vrq = vre;
        while( vrp < vre ) {
           *vrq = *vrp;
           vrq += ( row_vec & *vrp ) ? 0 : 1;
           vrp ++;
        }
        MakeRows( m, l + 1, rem ^ row_vec, vre, vrq, res );
     } else {
        for( int i = 0; i < m; i ++ ) {
           res.push_back( row_vecs[ i ] );
        }
     }
  }
}
 
__device__ int gather_para( 
      const u64* __restrict__ const vp, const int & bv, const int & ev,
      const u64 & x, u64* __restrict__ const & vpn, 
      const int must_bit, int & nvn ) 
{
   int j = 0;
   nvn = 0;
   for( int i = /*RDWS*/(bv); i < ev; i += WS ) {
      u64 v = -1ULL;
      if( i + threadIdx.x < ev ) {
         v = vp[ i + threadIdx.x ];
      }
      bool p = ! ( v & x );
      bool nesp = p and ( v >> must_bit );
      unsigned ballot = __ballot_sync( AllLanes, p );
      int sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
      nvn += __popc( __ballot_sync( AllLanes, nesp ) );
      if( p ) {
         vpn[ j + sub_sum ] = v;
      }
      j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
   }
   return j;
}
 
__device__ int cnext( const u64* __restrict__ vp, 
              const int & nv, const int must_bit )
{
   int j = 0;
   for( int i = 0; i < nv; i += WS ) {
      bool p = false;
      if( i + threadIdx.x < nv ) {
         p = vp[ i + threadIdx.x ] >> must_bit;
      }
      j += __popc( __ballot_sync( AllLanes, p ) );
      if( j != i + WS ) break;
   }
   return j;
}
 
__device__ int gather_cross( 
       const u64* __restrict__ const & vp, const int & nv, 
       const u64 & x, u64* __restrict__ const & vpn )
{
   int j = 0;
   for( int i = 0; i < nv; i += WS ) {
      u64 v = 0ULL;
      if( i + threadIdx.x < nv ) 
          v = vp[ i + threadIdx.x ];
      bool p = __popcll( v & x ) == 1; 
      unsigned ballot = __ballot_sync( AllLanes, p );
      int sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
      if( p )
          vpn[ j + sub_sum ] = v;
      j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
   }
   return j;
}
 
__device__ int gather_double_cross( 
        const u64* __restrict__ const & vp, const int & nv, 
        const u64 & x1, const u64 x2, u64* __restrict__ const & vpn )
{
   int j = 0;
   for( int i = 0; i < nv; i += WS ) {
      u64 v = 0ULL;
      if( i + threadIdx.x < nv ) 
          v = vp[ i + threadIdx.x ];
      bool p = __popcll( v & x1 ) == 1 and __popcll( v & x2 ) == 1;
      unsigned ballot = __ballot_sync( AllLanes, p );
      int sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
      if( p )
          vpn[ j + sub_sum ] = v;
      j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
   }
   return j;
}
 
__device__ int gather_triple_cross( 
     const u64* __restrict__ const & vp, const int & nv, 
     const u64 & x1, const u64 & x2, const u64 & x3, 
     u64* __restrict__ const & vpn )
{
   int j = 0;
#pragma unroll 1
   for( int i = 0; i < nv; i += WS ) {
      int i_t = i + threadIdx.x;
      u64 v = 0ULL;
      if( i_t < nv ) v = vp[ i_t ];
      bool p = __popcll( v & x1 ) == 1 
           and __popcll( v & x2 ) == 1
           and __popcll( v & x3 ) == 1;
      unsigned ballot = __ballot_sync( AllLanes, p );
      int sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
      if( p )
          vpn[ j + sub_sum ] = v;
      j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
   }
   return j;
}
 
__device__ void set_vecr( u64 vecs[ ], int index[ ], 
                             const u64 & v, const int & level ) 
{
  vecs[ level ] = v;
  if( ( v >> threadIdx.x ) & 1 )  {
     index[ NN - 1 - threadIdx.x ] = level;
  }
  if( ( ( v >> WS ) >> threadIdx.x ) & 1 )  {
     index[ NN - 1 - ( WS + threadIdx.x ) ] = level;
  }
  __syncwarp();
}
 
__device__ void set_vecc( u64 index[ ], const u64 & v ) 
{
  if( ( v >> threadIdx.x ) & 1 )  {
     index[ threadIdx.x ] = v;
  }
  if( ( ( v >> WS ) >> threadIdx.x ) & 1 )  {
     index[ WS + threadIdx.x ] = v;
  }
  __syncwarp();
}
 
__global__ void RowSet( u64* rowset, u64* vc_g, u64* vx_g )
{
  __shared__ u64 row_vecs[ NW ][ N ]; 
  #define ROW_VECS row_vecs[ threadIdx.y ]
 
  #define WHICH_ROW which_row[ threadIdx.y ]
 
  set_vecr( ROW_VECS, WHICH_ROW, v_max_c, N - 1 );
  set_vecr( ROW_VECS, WHICH_ROW, v_second_c, N - 2 );
 
  __shared__ u64* vc_s[ NW ];
  __shared__ u64* vx_s[ NW ];
  #define VC vc_s[ threadIdx.y ]
  #define VX vx_s[ threadIdx.y ]
 
  VC = vc_g + RUWS( ncols_c )  + vcsize_c * ( blockIdx.x * NW + threadIdx.y );
  VX = vx_g + RUWS( ndiags_c ) + vxsize_c * ( blockIdx.x * NW + threadIdx.y );
 
  VC[ vcsize_c - WS ] = 0ULL;
  VX[ vxsize_c - WS ] = 0ULL;
 
  u64 count = 0ULL;
  u64 sm_count = 0ULL;
 
  __shared__ int nbatch_s[ NW ];
  #define NBATCH nbatch_s[ threadIdx.y ]
  NBATCH = nbatch_c;
 
  for(   ; /* i_rowset < nrowset_c */; /* i_rowset += N - 2 */ ) {
 
     __shared__ int i_rowbat[ NW ];
     #define I_ROWBAT i_rowbat[ threadIdx.y ]
 
     if( threadIdx.x == 0 ) {
        I_ROWBAT = atomicAdd_system( &i_rowset_m, ( N - 2 ) * NBATCH );
     }
     __syncwarp();
 
     if( I_ROWBAT >= nrowset_c ) break;
 
     __shared__ int i_set[ NW ];
     #define I_SET i_set[ threadIdx.y ]
     I_SET = 0; 
     while( I_SET < NBATCH and I_ROWBAT + I_SET * ( N - 2 ) < nrowset_c ) {
 
       __shared__ bool single[ NW ];
       #define SINGLE single[ threadIdx.y ]
       SINGLE = single_c;
       for( int i = 0; i < N - 2; i ++ ) {
          u64 row_vec = rowset[ I_ROWBAT + ( I_SET * ( N - 2 ) ) + i ];
          set_vecr( ROW_VECS, WHICH_ROW, row_vec, N - 3 - i );
          SINGLE = SINGLE or rev_max_c == row_vec;
       }
 
       __shared__ int kvc_s[ NW ];
       __shared__ int kvx_s[ NW ];
       #define KVC kvc_s[ threadIdx.y ]
       #define KVX kvx_s[ threadIdx.y ]
 
#if N == 6
       KVC = gather_triple_cross( vc_g, ncols_c, 
           ROW_VECS[ N - 3 ], ROW_VECS[ N - 4 ], ROW_VECS[ N - 5 ], VC );
       KVX = gather_triple_cross( vx_g, ndiags_c, 
           ROW_VECS[ N - 3 ], ROW_VECS[ N - 4 ], ROW_VECS[ N - 5 ], VX );
#endif
#if N == 5
       KVC = gather_double_cross( vc_g, ncols_c, 
           ROW_VECS[ N - 3 ], ROW_VECS[ N - 4 ], VC );
       KVX = gather_double_cross( vx_g, ndiags_c,
           ROW_VECS[ N - 3 ], ROW_VECS[ N - 4 ], VX );
#endif
#if N == 4
       KVC = gather_cross( vc_g, ncols_c, ROW_VECS[ N - 3 ], VC );
       KVX = gather_cross( vx_g, ndiags_c, ROW_VECS[ N - 3 ], VX );
#endif
#if N == 3
       for( int i = 0; i < ncols_c; i++ ) VC[ i ] = vc_g[ i ];
       for( int i = 0; i < ndiags_c; i++ ) VX[ i ] = vx_g[ i ];
       KVC = ncols_c;
       KVX = ndiags_c;
#endif
 
       for( int iw = 0; iw < KVX; iw += WS ) {
         if( iw + threadIdx.x < KVX ) {
            u64 v = VX[ iw + threadIdx.x ];
            VX[ iw + threadIdx.x ]  |=
              u64 (    ( 63 - __clzll ( v & ROW_VECS[ 0 ] ) )
                   | ( ( 63 - __clzll ( v & ROW_VECS[ 1 ] ) ) <<  8 )
                   | ( ( 63 - __clzll ( v & ROW_VECS[ 2 ] ) ) << 16 ) 
                  ) << 40;
         }
       }
       __syncwarp();
 
#if N > 3
       int sm_count_cols = 0;
       count += MakeCols( VC, KVC, VX, KVX, SINGLE, sm_count_cols ) ;
       assert( ( sm_count_cols >> CINTLIM ) == 0 );
       sm_count += sm_count_cols;
#else
       u64 sub_count = 0;
       int sm_count_cols = 0;
       int nvn = cnext( VC, KVC, NN - 1 );
       LastCols( AllBits, VC, nvn, KVC - nvn, VX, KVX, SINGLE, 
                                         sub_count, sm_count_cols );
       count += sub_count;
       sm_count += sm_count_cols;
#endif
       #if N > 5
       #ifdef V
         if( blockIdx.x == 0 and threadIdx.x == 0 and threadIdx.y == 0 ) {
            printf( "r3:%09llx %9llu %9llu %5d\n", 
                 ROW_VECS[ N - 3 ], count, sm_count, NBATCH );
         }
       #endif
       #endif
 
       if( threadIdx.x == 0) I_SET++;
       __syncwarp();
    }
    if( threadIdx.x == 0 ) 
        NBATCH = max( (nrowset_c - i_rowset_m)/(N - 2)/totalwarps_c/3, 1 );
    __syncwarp();
  }
  if ( VC[ vcsize_c - WS ] != 0ULL or
       VX[ vxsize_c - WS ] != 0ULL ) {
     printf( " Array over run ! \n");
     assert( 0 );
     // exit( 1 );
  }
 
  for( int span = ( WS >> 1 ); span > 0; span >>= 1 ) {
     count += __shfl_xor_sync( AllLanes, count, span );
  }
  if( threadIdx.x == 0 ) 
  {
     atomicAdd_system( &count_m, count );
     atomicAdd_system( &sm_count_m, sm_count );
  }
}
 
#if N > 3
__device__ u64 MakeCols( u64* __restrict__ const & vcb, const int & nvc, 
                         u64* __restrict__ const & vxb, const int & nvx, 
                         const bool & single, int & sm_count)
{
  __shared__ u64  rem_s[ NW ][ N - 3 ];
  __shared__ u64* vcb_s[ NW ][ N - 3 ];
  __shared__ int  nvc_s[ NW ][ N - 3 ]; 
  __shared__ int  nvn_s[ NW ][ N - 3 ];
  __shared__ u64* vxb_s[ NW ][ N - 3 ]; 
  __shared__ int  nvx_s[ NW ][ N - 3 ]; 
  __shared__ bool  single_s[ NW ][ N - 3 ]; 
 
  #define REM_S rem_s[ threadIdx.y ]
  #define VCB_S vcb_s[ threadIdx.y ]
  #define NVC_S nvc_s[ threadIdx.y ]
  #define NVN_S nvn_s[ threadIdx.y ]
  #define VXB_S vxb_s[ threadIdx.y ]
  #define NVX_S nvx_s[ threadIdx.y ]
  #define SINGLE_S single_s[ threadIdx.y ]
 
  int lvl = N - 4;
  REM_S[ lvl ] = AllBits;//rem;
  VCB_S[ lvl ] = vcb;
  NVC_S[ lvl ] = nvc;
  NVN_S[ lvl ] = cnext( vcb, nvc, NN - 1 );
  VXB_S[ lvl ] = vxb;
  NVX_S[ lvl ] = nvx;
  SINGLE_S[ lvl ] = single;
 
  __shared__ int  ivc_s[ NW ][ N - 3 ];
  #define IVC_S ivc_s[ threadIdx.y ]
 
  u64 sub_count = 0;
  // int sm_count = 0;
 
  Init:
 
     IVC_S[ lvl ] = 0;
 
 //  for(  ; ivc_s[ lvl ] < nvn_s[ lvl ]; ivc_s[ lvl ] ++  ) {
 
  Loop:
 
     if( IVC_S[ lvl ] >= NVN_S[ lvl ] ) {  // Recursive return
        if( lvl == N - 4 ) 
           return sub_count;
        else {
           lvl ++;
           goto ReInit; 
        }
     }
 
{ // Scope limit for locals.
 
     __shared__ u64 col_vec[ NW ];
     #define COL_VEC col_vec[ threadIdx.y ]
     COL_VEC  = VCB_S[ lvl ][ IVC_S[ lvl ] ];
 
     __shared__ u64* vxbn[ NW ];
     #define VXBN vxbn[ threadIdx.y ]
     VXBN = RUWS( NVX_S[ lvl ] ) + VXB_S[ lvl ];
 
     __shared__ int kvx[ NW ];
     #undef KVX
     #define KVX kvx[ threadIdx.y ]
     KVX = gather_cross( VXB_S[ lvl ], NVX_S[ lvl ], COL_VEC, VXBN ); 
 
     __shared__ u64* vcbn[ NW ];
     #define VCBN vcbn[ threadIdx.y ]
     VCBN = RUWS( NVC_S[ lvl ] ) + VCB_S[ lvl ];
 
     __shared__ int kvc[ NW ];
     #undef KVC
     #define KVC kvc[ threadIdx.y ]
     __shared__ int kvn[ NW ];
     #define KVN kvn[ threadIdx.y ]
 
     KVC = gather_para( VCB_S[ lvl ], NVN_S[ lvl ], NVC_S[ lvl ],
                  COL_VEC, VCBN, topbit( REM_S[ lvl ] ^ COL_VEC ), KVN );  
 
     set_vecc( which_colv[threadIdx.y], COL_VEC );
 
     if( lvl > 0 ) {
	// Recursive call
        REM_S[ lvl - 1 ] = REM_S[ lvl ] ^ COL_VEC;
        VCB_S[ lvl - 1 ] = VCBN;
        NVC_S[ lvl - 1 ] = KVC;
 
 NVN_S[ lvl - 1 ] = KVN;
        VXB_S[ lvl - 1 ] = VXBN;
        NVX_S[ lvl - 1 ] = KVX;
        SINGLE_S[ lvl - 1 ] = SINGLE_S[ lvl ] or ( rev_max_c == COL_VEC );
 
        lvl --;
        goto Init;
 
     } else {  // 31 / 42
        LastCols( unsigned( REM_S[ lvl ] ^ COL_VEC ), VCBN, KVN, KVC - KVN,
             VXBN, KVX, SINGLE_S[ lvl ] or ( rev_max_c == COL_VEC ), 
             sub_count, sm_count );
     }
} // Scope limit for locals.
 
  ReInit:
    IVC_S[ lvl ] ++;
    goto Loop;
}
#endif // N > 3
 
__device__ void LastCols( unsigned rem, 
       const u64* __restrict__ const & vcb, const int & nvn, const int nvcmn, 
       u64* __restrict__ const & vxb, const int & nvx, 
       const bool single, u64 & sub_count, int & sm_count )
{
  unsigned thread_bit = 1U << threadIdx.x;
  u64* thread_ep =  & which_colv[ threadIdx.y ] [ threadIdx.x ];
 
  __shared__ int ic21_s[ NW ];
  #define IC21W ic21_s[ threadIdx.y ]
 
  int nc21 = nvn * ( nvcmn - 1 );
  for( IC21W = 0; IC21W < nc21; IC21W += WS ) {
 
    bool v1ep = false;
    bool psingl = single;
 
    if( IC21W + threadIdx.x < nc21 ) {
      u64 v2 = vcb[ ( IC21W + threadIdx.x ) / ( nvcmn - 1 ) ];
      unsigned v1 = unsigned( vcb[ nvn + ( IC21W + threadIdx.x ) % ( nvcmn - 1 ) ] );
      unsigned v0 = rem ^ unsigned( v2 ) ^ v1;
      v1ep = ( v1 >> topbit( rem ^ unsigned( v2 ) ) ) 
               and ( ( v1 & unsigned(v2) ) == 0 ) 
               and ( v_max32_c >= __brev(v0) );
      psingl = psingl 
                 or rev_max32_c == v2
                 or rev_max32_c == v1 
                 or rev_max32_c == v0 ;
    }
    unsigned v1e = __brev( __ballot_sync( AllLanes, v1ep ) );
    unsigned dbl = __ballot_sync( AllLanes, ! psingl );
 
    int ic21 = 0;
    while( v1e ) {
 
      int skip = __clz( v1e );
      ic21 += skip;
 
      u64 v2 = vcb[ ( IC21W + ic21 ) / ( nvcmn - 1 ) ];
      unsigned v1 = unsigned( vcb[ nvn + ( IC21W + ic21 ) % ( nvcmn - 1 ) ] );
 
      dbl >>= ( skip );
      v1e <<= ( skip + 1 );
      ic21 ++;
 
      if( unsigned( v2 ) & thread_bit ) *thread_ep = v2;
      if( unsigned( v2 >> WS ) & thread_bit ) *( thread_ep + WS ) = v2;
      if( ( rem ^ unsigned( v2 ) ) & thread_bit ) {
         *thread_ep 
           = u64( v1 ^ ( ( v1 & thread_bit ) ? 0 : ( rem ^ unsigned( v2 ) ) ) );
         // __syncwarp(); // synced in the gather_double_cross below.
      }
 
      u64* vxbn = RUWS( nvx ) + vxb;
      int kvx = gather_double_cross( vxb, nvx, v2, v1, vxbn );// 8/ 42
 
      int shift = ( dbl & 1 );
      dbl >>= 1;
 
      // avr( kvx ) : 8.6,  27% : > 10,  0.5% : > 20,  0.0005% : > 30
      while( kvx > MDIAG ) { // 14 / 42
        sub_count += MakeDiagS( vxbn, kvx ) << shift;
        vxbn ++;
        kvx --;
      }
      sub_count += MakeDiagC( vxbn, kvx ) << shift;
      sm_count += ( 1 << shift );
       __syncwarp(); // !!
         // necessary to prevent shared data from being updated too early.
    } 
  }
  return;
}
 
__device__ 
int MakeDiagS( const u64* __restrict__ const & vxb,  const int & nvx ) 
{
   int  diag_count = 0;
   u64 vdx = ( *vxb ) & ( ( 1ULL << 40 ) - 1 );
#pragma unroll 1
   for( int ic = 1; ic < nvx; ic += WS ) {
     if( ic + threadIdx.x < nvx ) {
       u64 vcx = vxb[ ic + threadIdx.x ];
#if N%2
       if( __popcll( vdx & vcx ) == 1 ) 
#else
       if( ( vdx & vcx )  == 0 ) 
#endif
       {  // 4 - 10 / 42
          if( FinalCheck( *vxb, vcx ) ) diag_count ++;
       }
     }
   }
   return diag_count;
}
 
__device__ 
int MakeDiagC( const u64* __restrict__ const & vx,  const int & vxn ) 
{
   int  diag_count = 0;
#pragma unroll 1
   for( int ivx = p_offset_c[ vxn ]; ivx < p_offset_c[ vxn + 1 ]; ivx += WS ) {
      if( ivx + threadIdx.x < p_offset_c[ vxn + 1 ] ) {
         int idx  = p_idx_c[ ivx + threadIdx.x ];
         u64 vdx = vx[ idx >> 16 ];
         u64 vcx = vx[ idx & 0xffff ];
#if N%2
         if( __popcll( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 1 ) 
#else
         if( ( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) )  == 0 ) 
#endif
         {   // 4-10 / 42
            if( FinalCheck( vdx, vcx ) ) diag_count ++;
         }
      }
   }
   return diag_count;
}
 
__device__ bool FinalCheck( const u64 & vdx, const u64 & vcx )
{
   int* e2row = which_row[ threadIdx.y ] + NN - 64;
   u64* e2colv = which_colv[ threadIdx.y ];
   int i;
   i = e2row[ __clzll( vdx & e2colv[ ( vcx >> 40 ) & 0xff ] ) ];
   if( e2row[ __clzll( vcx & e2colv[ ( vdx >> 40 ) & 0xff ] ) ] != i ) 
       return false;
 
   i = 40 + ( ( ( i & 1 ) + 1 ) << 3 );
 
   return ( e2row[ __clzll( vdx & e2colv[ ( vcx >> i ) & 0xff ] ) ] 
            ==
            e2row[ __clzll( vcx & e2colv[ ( vdx >> i ) & 0xff ] ) ] ) ;
}
 
#ifndef noMD5
void MakeMD5( u64 max_p, u64 second_p, u64 count, char md5str[] )
{
   MD5_CTX md5ctx;
   unsigned char md5bin[ MD5_DIGEST_LENGTH ];
   const char* sample = "e00000007 1c0000038    494199069345";
   int msg_len = strlen( sample );
   char msg[ msg_len + 1 ];
   sprintf( msg, "%09llx %09llx %15llu", max_p, second_p, count );
 
   MD5_Init( &md5ctx );
   MD5_Update( &md5ctx, msg, msg_len );
   MD5_Final( md5bin, &md5ctx );
 
   for( int i = 0; i < MD5_DIGEST_LENGTH; i ++ )
     sprintf( md5str + ( i * 2 ), "%02x", md5bin[ i ] );
   md5str[ MD5_DIGEST_LENGTH * 2 ] = 0;
}
#endif
 
int main( int argc, char* argv[] )
{
   fprintf( stderr, "%s\n", __FILE__ );
   fprintf( stderr, "Size of long long unsigned : %lu \n", sizeof( u64 ) );
   if( sizeof( u64 ) * 8 < NN ) {
      fprintf( stderr, 
               "Size of long long is too small : %lu \n", sizeof( u64 ) );
      exit( 1 );
   }
   if( check_topbit( ) ) {
      fprintf( stderr, "topbit() does't work correctly on this platform.\n" );
      exit( 1 );
   }
 
   GenVecs( NN, ( ( NN + 1 ) * N ) / 2, N - 1, 0ULL );
   fprintf( stderr, "nvecs %d\n", nvecs );
 
   /* pair indexes ----------------------------------- */
 
   int p_offset[ MDIAG + 2 ];
   int p_idx[ MDPAIR ];
 
   int ioff = 0;
   for( int ndiag = 0; ndiag < MDIAG + 1; ndiag ++ ) {
 
      p_offset[ ndiag ] = ioff;
      for( int idx = 0; idx < ndiag - 1; idx ++ ) {
         for( int icx = idx + 1; icx < ndiag; icx ++ ) {
             p_idx[ ioff ++ ] = ( idx << 16 ) | icx;
         }
      }
   }
   p_offset[ MDIAG + 1 ] = ioff;
 
   fprintf( stderr, "size of p_idx_c %d \n", ioff );
   if( ioff >= MDPAIR ) {
     fprintf(stderr, "p_idx_c index out of ragne. %d >= %d\n", ioff, MDPAIR );
     exit( 1 );
   }
 
   /* ------------------------------------------------ */
   // Get device properties
   // Set a part of constant memory
   //
   int ndev;
   CU_SAFE( cudaGetDeviceCount( &ndev ) );
   vector<int> dev_blocks( ndev );
   totalwarps = 0;
   for( int idev = 0 ; idev < ndev; idev ++ ) {
     CU_SAFE( cudaSetDevice( idev ) );
     cudaDeviceProp prop;
     CU_SAFE( cudaGetDeviceProperties( &prop, idev ) );
     int nblocks = prop.multiProcessorCount * NB_SM;
     dev_blocks[ idev ] = nblocks;
     fprintf( stderr, "DevId:%2d  #SM:%d, #Blocks:%d %s\n", 
                      idev, prop.multiProcessorCount, nblocks, prop.name);
 
     CU_SAFE( cudaMemcpyToSymbol( 
             p_offset_c, p_offset, sizeof( int ) * ( MDIAG + 2 ) ) );
     CU_SAFE( cudaMemcpyToSymbol( p_idx_c, p_idx, sizeof( int ) * ioff ) );
     totalwarps += nblocks * NW;
   }
   fprintf( stderr, "Total warps: %5d\n", totalwarps );
 
   // 
   u64 row1P = strtoll( argv[ 2 ], NULL, 16 );
   u64 row2P = strtoll( argv[ 4 ], NULL, 16 );
   fprintf( stderr, "r1:%09llx r2:%09llx\n", row1P, row2P );
 
   u64 total_count = 0ULL;
   u64 total_sm_count = 0ULL;
 
   for( int max_row = 0; max_row < nvecs; ++ max_row ) {
     u64 max_rowP = vecs[ max_row ];
     if( row1P != 0ULL and row1P < max_rowP ) continue;
     if( max_rowP < row1P ) break;
     if( ! ( max_rowP & 1ULL << ( NN - 1 ) ) ) break;
     if( max_rowP < Reverse( max_rowP ) ) continue;
     for( int second_row = max_row + 1; second_row < nvecs; ++ second_row ) {
         u64 second_rowP = vecs[ second_row ];
         if( row2P != 0ULL and row2P < second_rowP ) continue;
         if( second_rowP < row2P ) break;
         if( max_rowP & second_rowP ) continue;
         if( max_rowP < Reverse( second_rowP ) ) continue;
         u64 rem = AllBits ^ max_rowP ^ second_rowP;
         if( rem > second_rowP ) break;
 
         count_m = 0ULL;
         sm_count_m = 0ULL;
 
         Count( max_row, second_row, dev_blocks );
 
         printf( "%05d %09llx ", max_row, vecs[ max_row ] );
         printf( "%05d %09llx ", second_row, vecs[ second_row ] );
         printf( "%15llu ", count_m ); 
         #ifndef noMD5
            char md5str[ MD5_DIGEST_LENGTH * 2 + 1 ];
            MakeMD5( vecs[ max_row ], vecs[ second_row ], count_m, md5str );
            printf( "%s", md5str );
         #endif
         printf( "\n");
 
         printf( "%05d %09llx ", max_row, vecs[ max_row ] );
         printf( "%05d %09llx ", second_row, vecs[ second_row ] );
         printf( "S %13llu ", sm_count_m ); 
         #ifndef noMD5
            MakeMD5( vecs[ max_row ], vecs[ second_row ], sm_count_m, md5str );
            printf( "%s", md5str );
         #endif
         printf( "\n");
 
         total_count += count_m;
         total_sm_count += sm_count_m;
     }
   }
   if( row2P == 0ULL ) {
       fprintf( stderr, "Total: %17llu\n", total_count );
       fprintf( stderr, "Total_SM:%15llu\n", total_sm_count );
   }
   return 0;
}