ms-20240504.cu
An execution example on a GeForce RTX 4090
$ nvcc -O3 -arch=sm_60 -maxrregcount=40 ms-20240504.cu -lcrypto
$ time ./a.out 00000 e00000007 13745 180800250
ms-20240504.cu
Size of long long unsigned : 8
topbit() works correctly.
nvecs 32134
size of p_idx_c 1330
DevId: 0 #SM:128, #Blocks:2048 NVIDIA GeForce RTX 4090
Total warps:  6144
r1:e00000007 r2:180800250
nrows 3116 n3rd 711
row_set.size() 5956644,   242 sets/warp
00000 e00000007 13745 180800250    247438137089 824ce60dca0861defc383ee53d3367e7
00000 e00000007 13745 180800250 S  286856560469 65e40aa25a4edf87381bb51748779337

real    1m59.888s
user    1m59.621s
sys     0m0.272s
Note added on 2024.06.20
The function LastCols( ) in the code contains the following questionable comparison:
psingl = psingl or rev_max32_c == v2
which should be replaced by
psingl = psingl or rev_max32_c == unsigned( v2 )
because the upper 32 bits of v2 should be ignored when v2 is compared with the 32-bit unsigned rev_max32_c. However, this flaw is irrelevant to the validity of the results: the logical operation is entirely unnecessary and has no effect for even N (including 6), and for N < 6 the upper 32 bits of v2 are always 0, so the cast would change nothing.
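To see why the cast matters in principle, here is a minimal, self-contained host-side sketch (not part of ms-20240504.cu; the variable names merely mirror those in LastCols()). Without the cast, the usual arithmetic conversions widen the 32-bit unsigned operand to 64 bits, so all 64 bits of v2 take part in the comparison:

  #include <cstdio>
  typedef long long unsigned u64;

  int main( )
  {
    unsigned rev_max32 = 0x00000007U;  // 32-bit reference pattern
    u64 v2 = 0x0000000f00000007ULL;    // lower 32 bits match, upper 32 bits nonzero
    // rev_max32 is widened to u64, so the comparison sees all of v2:
    printf( "%d\n", rev_max32 == v2 );               // prints 0
    // The cast truncates v2 to its lower 32 bits first:
    printf( "%d\n", rev_max32 == unsigned( v2 ) );   // prints 1
    return 0;
  }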
This part of the code, and the other parts related to the self-complementarity check, should be removed from the functions MakeCols() and LastCols() to optimize performance for N = 6: complements of rows never appear as columns in even-order magic squares. A sketch of the simplification follows.
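As an illustration only (a sketch, not the published code), the prologue of the pair loop in LastCols() could then shrink to the following for N = 6, keeping the row-derived single flag and dropping the three rev_max32_c comparisons; the analogous rev_max_c test in MakeCols() would be removed in the same way:

  bool v1ep = false;
  bool psingl = single;  // row-derived flag only; no column can be a row's complement
  if( ic21w + threadIdx.x < nc21 ) {
    u64 v2 = vcb[ ( ic21w + threadIdx.x ) / ( nvcmn - 1 ) ];
    unsigned v1 = unsigned( vcb[ nvn + ( ic21w + threadIdx.x ) % ( nvcmn - 1 ) ] );
    unsigned v0 = rem ^ unsigned( v2 ) ^ v1;
    v1ep = ( v1 >> topbit( rem ^ unsigned( v2 ) ) )
           and ( ( v1 & unsigned( v2 ) ) == 0 )
           and ( v_max32_c >= __brev( v0 ) );
  }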
- ms-20240504.cu
/* ms.cu
   Counting number of magic squares up to rotations, reflections
   and M transformations. Simultaneously counting semi-magic squares.

   Compilation :
     nvcc -O3 -arch=sm_60 -maxrregcount=40 \
          -Wno-deprecated-declarations ms.cu -lcrypto
   To omit md5 checksum, add a -DnoMD5 option. Then you may drop
   -Wno-deprecated-declarations and -lcrypto.
   -DV option produces verbose outputs for N=6.

   Command line options:
     % ./a.out <1stRowNo> <1stRowPattern> <2ndRowNo> <2ndRowPattern>
     example : ./a.out 00000 e00000007 13745 180800250

   See also : https://magicsquare6.net/
   Updated on 2024/05/04.
   Updated on 2023/11/28: Critical __syncwarp() added in LastCols().
   Authored by Hidetoshi Mino hidetoshimino@gmail.com . */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <assert.h>
#ifndef noMD5
#include <openssl/md5.h>
#endif
#include <vector>
using namespace std;

#ifndef N // order of square : 3..6
#define N 6
#endif
#ifndef NB_SM // Number of blocks per SM
#define NB_SM 16
#endif
#ifndef NW // Number of Warps per block
#define NW 3
#endif
#ifndef NSMPL
#define NSMPL 1
#endif
#ifndef MDIAG
#define MDIAG 20
#endif
#ifndef CINTLIM
#define CINTLIM 26
#endif

typedef long long unsigned u64;
#define NN (N*N)
#define AllBits ( ( 1ULL << NN ) - 1 )
#define AllLanes (0xffffffff)
#define MAX( x, y ) ( (x) > (y) ? (x) : (y) )

#define CU_SAFE(stmt) \
  do { \
    cudaError_t status = stmt; \
    if (status != cudaSuccess) { \
      printf("ERROR: %s in %s:%d.\n", cudaGetErrorString(status), \
             __FILE__, __LINE__); \
      exit( 1 ); \
    } \
  } while (0)

#define WS 32 /* Warp size */
#define RUWS( x ) ( ( ( x ) + ( WS - 1 ) ) & ~( WS - 1 ) ) /* Round up to WS   */
#define RDWS( x ) ( ( x ) & ~( WS - 1 ) )                  /* Round down to WS */

__device__ __host__ int topbit( u64 v )
{
#ifdef __CUDA_ARCH__
  return 63 - __clzll( v );
#else
  v = v & ( ~( v >> 1 ) );
  float f32 = (float) v;
  int pos = *( (int *) &f32 );
  pos = ( pos >> 23 ) - 0x7f;
  return pos;
#endif
/*
  int pos = 0;
  int v32 = v >> 32;
  if( v32 ) { pos = 32; } else { v32 = (int) v; }
  if( v32 & 0xffff0000UL ) { pos += 16; v32 >>= 16; }
  if( v32 & 0x0000ff00UL ) { pos += 8; v32 >>= 8; }
  if( v32 & 0x000000f0UL ) { pos += 4; v32 >>= 4; }
  if( v32 & 0x0000000cUL ) { pos += 2; v32 >>= 2; }
  if( v32 & 0000000002UL ) pos++;
  return pos;
*/
}

__device__ __host__ u64 reverse64( u64 x )
{
#ifdef __CUDA_ARCH__
  return __brevll( x );
#else
  u64 y;
  y = ( ( x & 0x5555555555555555ULL ) <<  1 );
  x = ( y | ( ( x >>  1 ) & 0x5555555555555555ULL ) );
  y = ( ( x & 0x3333333333333333ULL ) <<  2 );
  x = ( y | ( ( x >>  2 ) & 0x3333333333333333ULL ) );
  y = ( ( x & 0x0F0F0F0F0F0F0F0FULL ) <<  4 );
  x = ( y | ( ( x >>  4 ) & 0x0F0F0F0F0F0F0F0FULL ) );
  y = ( ( x & 0x00FF00FF00FF00FFULL ) <<  8 );
  x = ( y | ( ( x >>  8 ) & 0x00FF00FF00FF00FFULL ) );
  y = ( ( x & 0x0000FFFF0000FFFFULL ) << 16 );
  x = ( y | ( ( x >> 16 ) & 0x0000FFFF0000FFFFULL ) );
  y = ( ( x & 0x00000000FFFFFFFFULL ) << 32 );
  x = ( y | ( ( x >> 32 ) & 0x00000000FFFFFFFFULL ) );
  return x;
#endif
}

int check_topbit( )
{
  for( int i = 0; i < NN; i ++ ) {
    // printf( "%d %09llx %d\n", i, 1ULL << i, topbit( 1ULL << i ) );
    if( topbit( 1ULL << i ) != i ) return 1;
  }
  for( int i = 1; i < NN; i ++ ) {
    // printf( "%d %09llx %d\n"
    //       , i, ( 1ULL << i ) - 1, topbit( ( 1ULL << i ) - 1 ) );
    if( topbit( ( 1ULL << i ) - 1 ) != i - 1 ) return 1;
  }
  fprintf( stderr, "topbit() works correctly.\n" );
  return 0;
}

__device__ __host__ int vcsize_f( int ncols, int order )
{
  return RUWS( ncols ) * ( order - 2 ) + WS;
}

__device__ __host__ int vxsize_f( int ndiags, int order )
{
  return RUWS( ndiags ) * ( order - 1 ) + WS;
}

/* = Host globals ======================= */
#define MVECS 32768
int nvecs = 0;
u64 vecs[ MVECS ];
int totalwarps;

/* = Constant memory ==================== */
__constant__ u64 v_max_c;
__constant__ unsigned v_max32_c;
__constant__ u64 rev_max_c;
__constant__ unsigned rev_max32_c;
__constant__ u64 v_second_c;
__constant__ bool single_c;
__constant__ int nrowset_c;
__constant__ int ncols_c, ndiags_c;
__constant__ int vcsize_c, vxsize_c;
__constant__ int nbatch_c;
__constant__ int totalwarps_c;
#define MDPAIR ( MDIAG * ( MDIAG + 1 ) * ( MDIAG - 1 ) / 6 )
__constant__ int p_offset_c[ MDIAG + 2 ];
__constant__ int p_idx_c[ MDPAIR ];

/* = Block __shared__ ==================== */
__shared__ int which_row[ NW ][ NN ];
__shared__ u64 which_colv[ NW ][ NN ];

/* = Managed ( Unified ) =================== */
__managed__ int i_rowset_m = 0;
__managed__ u64 count_m = 0ULL;
__managed__ u64 sm_count_m = 0ULL;
/* ========================================= */

void Count( int max_row, int second_row, const vector<int> & dev_blocks );
void MakeRows( int m, int l, u64 rem, u64* vrb, u64* vre, vector<u64> & res );
__global__ void RowSet( u64* row_set_d, u64* vc_g, u64* vx_g );
__device__ u64 MakeCols( u64* __restrict__ const & vcb, const int & nvc,
                         u64* __restrict__ const & vxb, const int & nvx,
                         const bool & single, int & sm_count );
__device__ void LastCols( unsigned rem,
                          const u64* __restrict__ const & vcb,
                          const int & nvn, const int nvcmn,
                          u64* __restrict__ const & vxb, const int & nvx,
                          const bool single,
                          u64 & sub_count, int & sm_count );
__device__ int MakeDiag( const u64* __restrict__ const & vxb, const int & nvx );
__device__ int MakeDiagC( const u64* __restrict__ const & vxb, const int & nvx );
__device__ int MakeDiagL( const u64* __restrict__ const & vxb, const int & nvx );
__device__ bool FinalCheck( const u64 & vdx, const u64 & vcx );

int Cross( int v1, int v2 )
{
  u64 v = vecs[v1] & vecs[v2];
  return v && ( ! ( v & ( v - 1 ) ) );
}

__device__ __host__ u64 Reverse( u64 v )
{
  return reverse64( v ) >> (64 - NN);
}

void GenVecs( int max_ent, int rem_sum, int level, u64 vp )
{
  if ( level >= 0 ) {
    for ( int ent = max_ent; ent > 0 ; --ent ) {
      u64 v = 1ULL << ( ent - 1 );
      if ( rem_sum >= ent ) GenVecs( ent - 1, rem_sum - ent, level - 1, vp | v );
    }
  } else {
    if ( rem_sum == 0 ) { vecs[ nvecs ++ ] = vp; }
  }
  if( nvecs > MVECS ) { fprintf( stderr, "nvecs > MVECS\n" ); exit(1); }
}

void Count( int max_row, int second_row, const vector<int> & dev_blocks )
{
  u64 v_max = vecs[ max_row ];
  unsigned v_max32 = ( NN > 32 ) ? v_max >> ( NN - 32 ) : v_max << ( 32 - NN );
  u64 v_second = vecs[ second_row ];
  u64 rem = AllBits ^ v_max ^ v_second;
  u64 must_bit = ( 1ULL << topbit( rem ) );
  vector<u64> rows;  rows.reserve( nvecs );
  vector<u64> cols;  cols.reserve( nvecs );
  vector<u64> diags; diags.reserve( nvecs );
  int n3rd = 0;
  for( int i = second_row + 1; i < nvecs; ++ i )
    if( ( ! ( v_max & vecs[ i ] ) ) && ( ! ( v_second & vecs[ i ] ) )
        && v_max >= Reverse( vecs[ i ] ) ) {
      rows.push_back( vecs[ i ] );
      if( vecs[ i ] & must_bit ) n3rd ++;
    }
  int nrows = rows.size();
  vector<u64> row_set;
  row_set.reserve( n3rd * 3000 * ( N - 2 ) );
  u64 vr[ nrows * ( N - 2 ) ];
  for( int i = 0; i < nrows; i ++ ) vr[ i ] = rows[ i ];
  fprintf( stderr, "nrows %d n3rd %d \n", nrows, n3rd );
  MakeRows( N - 2, 0, rem, vr, vr + nrows, row_set );
  fprintf( stderr, "row_set.size() %7lu, %5lu sets/warp \n",
           row_set.size(), row_set.size() / ( N - 2 ) / totalwarps );
  int nbatch_ini = max( row_set.size() / ( N - 2 ) / totalwarps / 3, 1ul );
  for( int i = max_row + 1; i < nvecs; ++ i )
    if( Cross( max_row, i ) && Cross( second_row, i )
        && v_max >= Reverse( vecs[ i ] ) ) {
      cols.push_back( vecs[ i ] );
    }
  int ncols = cols.size();
  for( int i = 0; i < nvecs; ++ i )
    if( Cross( max_row, i ) && Cross( second_row, i ) )
      diags.push_back( vecs[ i ] );
  int ndiags = diags.size();

  /* constant parameters for the __global__ RowSet() */
  u64 rev_max = Reverse( v_max );
  unsigned rev_max32 = unsigned( rev_max );
  bool single = rev_max == v_max or rev_max == v_second;
  int nrowset = row_set.size();
  int vcsize = vcsize_f( ncols, N );
  int vxsize = vxsize_f( ndiags, N );

  // Device Memory Allocation -------------------
  int ndev = dev_blocks.size();
  u64* row_set_d[ ndev ];
  u64* vc_d[ ndev ];
  u64* vx_d[ ndev ];
  i_rowset_m = 0; // initialize managed variable
  // count_m = 0ULL; // Don't initialize here.
  for( int idev = 0 ; idev < ndev; idev ++ ) {
    CU_SAFE( cudaSetDevice( idev ) );
    int nblocks = dev_blocks[ idev ];
    CU_SAFE( cudaMalloc( row_set_d + idev, sizeof( u64 ) * row_set.size() ) );
    CU_SAFE( cudaMemcpy( row_set_d[ idev ], row_set.data(),
                         sizeof( u64 ) * row_set.size(),
                         cudaMemcpyHostToDevice ) );
    CU_SAFE( cudaMalloc( vc_d + idev,
                         sizeof(u64) * ( RUWS( ncols ) + vcsize * nblocks * NW ) ) );
    CU_SAFE( cudaMalloc( vx_d + idev,
                         sizeof(u64) * ( RUWS( ndiags ) + vxsize * nblocks * NW ) ) );
    CU_SAFE( cudaMemcpy( vc_d[ idev ], cols.data(), ncols*sizeof(u64),
                         cudaMemcpyHostToDevice ) );
    CU_SAFE( cudaMemcpy( vx_d[ idev ], diags.data(), ndiags*sizeof(u64),
                         cudaMemcpyHostToDevice ) );
    // constant memory setup --------------------
    CU_SAFE( cudaMemcpyToSymbol( v_max_c, &v_max, sizeof( u64 ) ) );
    CU_SAFE( cudaMemcpyToSymbol( v_max32_c, &v_max32, sizeof(unsigned) ) );
    CU_SAFE( cudaMemcpyToSymbol( v_second_c, &v_second, sizeof( u64 ) ) );
    CU_SAFE( cudaMemcpyToSymbol( rev_max_c, &rev_max, sizeof( u64 ) ) );
    CU_SAFE( cudaMemcpyToSymbol( rev_max32_c, &rev_max32, sizeof(unsigned) ) );
    CU_SAFE( cudaMemcpyToSymbol( single_c, &single, sizeof( bool ) ) );
    CU_SAFE( cudaMemcpyToSymbol( nrowset_c, &nrowset, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( ncols_c, &ncols, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( ndiags_c, &ndiags, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( vcsize_c, &vcsize, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( vxsize_c, &vxsize, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( nbatch_c, &nbatch_ini, sizeof( int ) ) );
    CU_SAFE( cudaMemcpyToSymbol( totalwarps_c, &totalwarps, sizeof( int ) ) );
    // ------------------------------------------
    RowSet <<< nblocks, dim3( WS, NW ) >>>
      ( row_set_d[ idev ], vc_d[ idev ], vx_d[ idev ] );
  }
  for( int idev = 0 ; idev < ndev; idev ++ ) {
    CU_SAFE( cudaSetDevice( idev ) );
    // cudaDeviceSynchronize();
    // Device Memory Deallocation ----------------
    CU_SAFE( cudaFree( vx_d[ idev ] ) );
    CU_SAFE( cudaFree( vc_d[ idev ] ) );
    CU_SAFE( cudaFree( row_set_d[ idev ] ) );
  }
  return;
}

void MakeRows( int m, int l, u64 rem, u64* vrb, u64* vre, vector<u64> & res )
{
  static u64 row_vecs[ N ];
  u64 must_bit = ( 1ULL << topbit( rem ) );
  for( ; vrb < vre; vrb += NSMPL ) {
    u64 row_vec = *vrb;
    if( ! ( row_vec & must_bit ) ) break;
    row_vecs[ l ] = row_vec;
    if( m > l + 1 ) {
      u64* vrp = vrb + 1;
      u64* vrq = vre;
      while( vrp < vre ) {
        *vrq = *vrp;
        vrq += ( row_vec & *vrp ) ? 0 : 1;
        vrp ++;
      }
      MakeRows( m, l + 1, rem ^ row_vec, vre, vrq, res );
    } else {
      for( int i = 0; i < m; i ++ ) { res.push_back( row_vecs[ i ] ); }
    }
  }
}

__device__ int gather_para( const u64* __restrict__ const vp,
                            const int & bv, const int & ev,
                            const u64 & x, u64* __restrict__ const & vpn,
                            const int must_bit, int & nvn )
{
  int j = 0;
  nvn = 0;
#pragma unroll 1
  for( int i = /*RDWS*/(bv); i < ev; i += WS ) {
    u64 v = -1ULL;
    if( i + threadIdx.x < ev ) { v = vp[ i + threadIdx.x ]; }
    bool p = ! ( v & x );
    bool nesp = p and ( v >> must_bit );
    unsigned ballot = __ballot_sync( AllLanes, p );
    int sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
    nvn += __popc( __ballot_sync( AllLanes, nesp ) );
    if( p ) { vpn[ j + sub_sum ] = v; }
    j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
  }
  return j;
}

__device__ int cnext( const u64* __restrict__ vp, const int & nv,
                      const int must_bit )
{
  int j = 0;
#pragma unroll 1
  for( int i = 0; i < nv; i += WS ) {
    bool p = false;
    if( i + threadIdx.x < nv ) { p = vp[ i + threadIdx.x ] >> must_bit; }
    j += __popc( __ballot_sync( AllLanes, p ) );
    if( j != i + WS ) break;
  }
  return j;
}

__device__ int gather_cross( const u64* __restrict__ const & vp,
                             const int & nv, const u64 & x,
                             u64* __restrict__ const & vpn )
{
  int j = 0;
#pragma unroll 1
  for( int i = 0; i < nv; i += WS ) {
    u64 v = 0ULL;
    if( i + threadIdx.x < nv ) v = vp[ i + threadIdx.x ];
    bool p = __popcll( v & x ) == 1;
    unsigned ballot = __ballot_sync( AllLanes, p );
    int sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
    if( p ) vpn[ j + sub_sum ] = v;
    j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
  }
  return j;
}

__device__ int gather_double_cross( const u64* __restrict__ const & vp,
                                    const int & nv,
                                    const u64 & x1, const u64 x2,
                                    u64* __restrict__ const & vpn )
{
  int j = 0;
#pragma unroll 1
  for( int i = 0; i < nv; i += WS ) {
    u64 v = 0ULL;
    if( i + threadIdx.x < nv ) v = vp[ i + threadIdx.x ];
    bool p = __popcll( v & x1 ) == 1 and __popcll( v & x2 ) == 1;
    unsigned ballot = __ballot_sync( AllLanes, p );
    int sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
    if( p ) vpn[ j + sub_sum ] = v;
    j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
  }
  return j;
}

__device__ int gather_triple_cross( const u64* __restrict__ const & vp,
                                    const int & nv,
                                    const u64 & x1, const u64 & x2,
                                    const u64 & x3,
                                    u64* __restrict__ const & vpn )
{
  int j = 0;
#pragma unroll 1
  for( int i = 0; i < nv; i += WS ) {
    int i_t = i + threadIdx.x;
    u64 v = 0ULL;
    if( i_t < nv ) v = vp[ i_t ];
    bool p = __popcll( v & x1 ) == 1 and __popcll( v & x2 ) == 1
             and __popcll( v & x3 ) == 1;
    unsigned ballot = __ballot_sync( AllLanes, p );
    int sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
    if( p ) vpn[ j + sub_sum ] = v;
    j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
  }
  return j;
}

__device__ void set_vecr( u64 vecs[ ], int index[ ], const u64 & v,
                          const int & level )
{
  vecs[ level ] = v;
  if( ( v >> threadIdx.x ) & 1 ) { index[ NN - 1 - threadIdx.x ] = level; }
  if( ( ( v >> WS ) >> threadIdx.x ) & 1 ) {
    index[ NN - 1 - ( WS + threadIdx.x ) ] = level;
  }
  __syncwarp();
}

__device__ void set_vecc( u64 index[ ], const u64 & v )
{
  if( ( v >> threadIdx.x ) & 1 ) { index[ threadIdx.x ] = v; }
  if( ( ( v >> WS ) >> threadIdx.x ) & 1 ) { index[ WS + threadIdx.x ] = v; }
  __syncwarp();
}

__global__ void RowSet( u64* rowset, u64* vc_g, u64* vx_g )
{
  __shared__ u64 row_vecs[ NW ][ N ];
#define ROW_VECS row_vecs[ threadIdx.y ]
#define WHICH_ROW which_row[ threadIdx.y ]
  set_vecr( ROW_VECS, WHICH_ROW, v_max_c, N - 1 );
  set_vecr( ROW_VECS, WHICH_ROW, v_second_c, N - 2 );
  __shared__ u64* vc_s[ NW ];
  __shared__ u64* vx_s[ NW ];
#define VC vc_s[ threadIdx.y ]
#define VX vx_s[ threadIdx.y ]
  VC = vc_g + RUWS( ncols_c ) + vcsize_c * ( blockIdx.x * NW + threadIdx.y );
  VX = vx_g + RUWS( ndiags_c ) + vxsize_c * ( blockIdx.x * NW + threadIdx.y );
  VC[ vcsize_c - WS ] = 0ULL;
  VX[ vxsize_c - WS ] = 0ULL;
  u64 count = 0ULL;
  u64 sm_count = 0ULL;
  __shared__ int nbatch_s[ NW ];
#define NBATCH nbatch_s[ threadIdx.y ]
  NBATCH = nbatch_c;
  for( ; /* i_rowset < nrowset_c */; /* i_rowset += N - 2 */ ) {
    __shared__ int i_rowbat[ NW ];
#define I_ROWBAT i_rowbat[ threadIdx.y ]
    if( threadIdx.x == 0 ) {
      I_ROWBAT = atomicAdd_system( &i_rowset_m, ( N - 2 ) * NBATCH );
    }
    __syncwarp();
    if( I_ROWBAT >= nrowset_c ) break;
    __shared__ int i_set[ NW ];
#define I_SET i_set[ threadIdx.y ]
    I_SET = 0;
    while( I_SET < NBATCH and I_ROWBAT + I_SET * ( N - 2 ) < nrowset_c ) {
      __shared__ bool single[ NW ];
#define SINGLE single[ threadIdx.y ]
      SINGLE = single_c;
      for( int i = 0; i < N - 2; i ++ ) {
        u64 row_vec = rowset[ I_ROWBAT + ( I_SET * ( N - 2 ) ) + i ];
        set_vecr( ROW_VECS, WHICH_ROW, row_vec, N - 3 - i );
        SINGLE = SINGLE or rev_max_c == row_vec;
      }
      __shared__ int kvc_s[ NW ];
      __shared__ int kvx_s[ NW ];
#define KVC kvc_s[ threadIdx.y ]
#define KVX kvx_s[ threadIdx.y ]
#if N == 6
      KVC = gather_triple_cross( vc_g, ncols_c, ROW_VECS[ N - 3 ],
                                 ROW_VECS[ N - 4 ], ROW_VECS[ N - 5 ], VC );
      KVX = gather_triple_cross( vx_g, ndiags_c, ROW_VECS[ N - 3 ],
                                 ROW_VECS[ N - 4 ], ROW_VECS[ N - 5 ], VX );
#endif
#if N == 5
      KVC = gather_double_cross( vc_g, ncols_c, ROW_VECS[ N - 3 ],
                                 ROW_VECS[ N - 4 ], VC );
      KVX = gather_double_cross( vx_g, ndiags_c, ROW_VECS[ N - 3 ],
                                 ROW_VECS[ N - 4 ], VX );
#endif
#if N == 4
      KVC = gather_cross( vc_g, ncols_c, ROW_VECS[ N - 3 ], VC );
      KVX = gather_cross( vx_g, ndiags_c, ROW_VECS[ N - 3 ], VX );
#endif
#if N == 3
      for( int i = 0; i < ncols_c; i++ ) VC[ i ] = vc_g[ i ];
      for( int i = 0; i < ndiags_c; i++ ) VX[ i ] = vx_g[ i ];
      KVC = ncols_c;
      KVX = ndiags_c;
#endif
      for( int iw = 0; iw < KVX; iw += WS ) {
        if( iw + threadIdx.x < KVX ) {
          u64 v = VX[ iw + threadIdx.x ];
          VX[ iw + threadIdx.x ] |=
            u64( ( 63 - __clzll( v & ROW_VECS[ 0 ] ) )
                 | ( ( 63 - __clzll( v & ROW_VECS[ 1 ] ) ) << 8 )
                 | ( ( 63 - __clzll( v & ROW_VECS[ 2 ] ) ) << 16 ) ) << 40;
        }
      }
      __syncwarp();
#if N > 3
      int sm_count_cols = 0;
      count += MakeCols( VC, KVC, VX, KVX, SINGLE, sm_count_cols );
      assert( ( sm_count_cols >> CINTLIM ) == 0 );
      sm_count += sm_count_cols;
#else
      u64 sub_count = 0;
      int sm_count_cols = 0;
      int nvn = cnext( VC, KVC, NN - 1 );
      LastCols( AllBits, VC, nvn, KVC - nvn, VX, KVX, SINGLE,
                sub_count, sm_count_cols );
      count += sub_count;
      sm_count += sm_count_cols;
#endif
#if N > 5
#ifdef V
      if( blockIdx.x == 0 and threadIdx.x == 0 and threadIdx.y == 0 ) {
        printf( "r3:%09llx %9llu %9llu %5d\n",
                ROW_VECS[ N - 3 ], count, sm_count, NBATCH );
      }
#endif
#endif
      if( threadIdx.x == 0 ) I_SET ++;
      __syncwarp();
    }
    if( threadIdx.x == 0 )
      NBATCH = max( ( nrowset_c - i_rowset_m ) / ( N - 2 ) / totalwarps_c / 3, 1 );
    __syncwarp();
  }
  if( VC[ vcsize_c - WS ] != 0ULL or VX[ vxsize_c - WS ] != 0ULL ) {
    printf( " Array over run ! \n" );
    assert( 0 ); // exit( 1 );
  }
  for( int span = ( WS >> 1 ); span > 0; span >>= 1 ) {
    count += __shfl_xor_sync( AllLanes, count, span );
  }
  if( threadIdx.x == 0 ) {
    atomicAdd_system( &count_m, count );
    atomicAdd_system( &sm_count_m, sm_count );
  }
}

#if N > 3
__device__ u64 MakeCols( u64* __restrict__ const & vcb, const int & nvc,
                         u64* __restrict__ const & vxb, const int & nvx,
                         const bool & single, int & sm_count )
{
  __shared__ u64 rem_s[ NW ][ N - 3 ];
  __shared__ u64* vcb_s[ NW ][ N - 3 ];
  __shared__ int nvc_s[ NW ][ N - 3 ];
  __shared__ int nvn_s[ NW ][ N - 3 ];
  __shared__ u64* vxb_s[ NW ][ N - 3 ];
  __shared__ int nvx_s[ NW ][ N - 3 ];
  __shared__ bool single_s[ NW ][ N - 3 ];
#define REM_S rem_s[ threadIdx.y ]
#define VCB_S vcb_s[ threadIdx.y ]
#define NVC_S nvc_s[ threadIdx.y ]
#define NVN_S nvn_s[ threadIdx.y ]
#define VXB_S vxb_s[ threadIdx.y ]
#define NVX_S nvx_s[ threadIdx.y ]
#define SINGLE_S single_s[ threadIdx.y ]
  int lvl = N - 4;
  REM_S[ lvl ] = AllBits; //rem;
  VCB_S[ lvl ] = vcb;
  NVC_S[ lvl ] = nvc;
  NVN_S[ lvl ] = cnext( vcb, nvc, NN - 1 );
  VXB_S[ lvl ] = vxb;
  NVX_S[ lvl ] = nvx;
  SINGLE_S[ lvl ] = single;
  __shared__ int ivc_s[ NW ][ N - 3 ];
#define IVC_S ivc_s[ threadIdx.y ]
  u64 sub_count = 0;
  // int sm_count = 0;
 Init:
  IVC_S[ lvl ] = 0;
  // for( ; ivc_s[ lvl ] < nvn_s[ lvl ]; ivc_s[ lvl ] ++ ) {
 Loop:
  if( IVC_S[ lvl ] >= NVN_S[ lvl ] ) { // Recursive return
    if( lvl == N - 4 ) return sub_count;
    else { lvl ++; goto ReInit; }
  }
  { // Scope limit for locals.
    __shared__ u64 col_vec[ NW ];
#define COL_VEC col_vec[ threadIdx.y ]
    COL_VEC = VCB_S[ lvl ][ IVC_S[ lvl ] ];
    __shared__ u64* vxbn[ NW ];
#define VXBN vxbn[ threadIdx.y ]
    VXBN = RUWS( NVX_S[ lvl ] ) + VXB_S[ lvl ];
    __shared__ int kvx[ NW ];
#undef KVX
#define KVX kvx[ threadIdx.y ]
    KVX = gather_cross( VXB_S[ lvl ], NVX_S[ lvl ], COL_VEC, VXBN );
    __shared__ u64* vcbn[ NW ];
#define VCBN vcbn[ threadIdx.y ]
    VCBN = RUWS( NVC_S[ lvl ] ) + VCB_S[ lvl ];
    __shared__ int kvc[ NW ];
#undef KVC
#define KVC kvc[ threadIdx.y ]
    __shared__ int kvn[ NW ];
#define KVN kvn[ threadIdx.y ]
    KVC = gather_para( VCB_S[ lvl ], NVN_S[ lvl ], NVC_S[ lvl ], COL_VEC,
                       VCBN, topbit( REM_S[ lvl ] ^ COL_VEC ), KVN );
    set_vecc( which_colv[ threadIdx.y ], COL_VEC );
    if( lvl > 0 ) { // Recursive call
      REM_S[ lvl - 1 ] = REM_S[ lvl ] ^ COL_VEC;
      VCB_S[ lvl - 1 ] = VCBN;
      NVC_S[ lvl - 1 ] = KVC;
      NVN_S[ lvl - 1 ] = KVN;
      VXB_S[ lvl - 1 ] = VXBN;
      NVX_S[ lvl - 1 ] = KVX;
      SINGLE_S[ lvl - 1 ] = SINGLE_S[ lvl ] or ( rev_max_c == COL_VEC );
      lvl --;
      goto Init;
    } else { // 31 / 42
      LastCols( unsigned( REM_S[ lvl ] ^ COL_VEC ), VCBN, KVN, KVC - KVN,
                VXBN, KVX, SINGLE_S[ lvl ] or ( rev_max_c == COL_VEC ),
                sub_count, sm_count );
    }
  } // Scope limit for locals.
 ReInit:
  IVC_S[ lvl ] ++;
  goto Loop;
}
#endif // N > 3

__device__ void LastCols( unsigned rem,
                          const u64* __restrict__ const & vcb,
                          const int & nvn, const int nvcmn,
                          u64* __restrict__ const & vxb, const int & nvx,
                          const bool single,
                          u64 & sub_count, int & sm_count )
{
  unsigned thread_bit = 1U << threadIdx.x;
  u64* thread_ep = & which_colv[ threadIdx.y ][ threadIdx.x ];
/*
  __shared__ int ic21w_s[ NW ];
#define IC21W ic21w_s[ threadIdx.y ]
  __shared__ int nc21_s[ NW ];
#define NC21 nc21_s[ threadIdx.y ]
*/
  int nc21 = nvn * ( nvcmn - 1 );
  for( int ic21w = 0; ic21w < nc21; ic21w += WS ) {
    bool v1ep = false;
    bool psingl = single;
    if( ic21w + threadIdx.x < nc21 ) {
      u64 v2 = vcb[ ( ic21w + threadIdx.x ) / ( nvcmn - 1 ) ];
      unsigned v1 = unsigned( vcb[ nvn + ( ic21w + threadIdx.x ) % ( nvcmn - 1 ) ] );
      unsigned v0 = rem ^ unsigned( v2 ) ^ v1;
      v1ep = ( v1 >> topbit( rem ^ unsigned( v2 ) ) )
             and ( ( v1 & unsigned( v2 ) ) == 0 )
             and ( v_max32_c >= __brev( v0 ) );
      psingl = psingl or rev_max32_c == v2
               or rev_max32_c == v1 or rev_max32_c == v0;
    }
    unsigned v1e = __brev( __ballot_sync( AllLanes, v1ep ) );
    unsigned dbl = __ballot_sync( AllLanes, ! psingl );
    int ic21 = 0;
    while( v1e ) {
      int skip = __clz( v1e );
      ic21 += skip;
      u64 v2 = vcb[ ( ic21w + ic21 ) / ( nvcmn - 1 ) ];
      unsigned v1 = unsigned( vcb[ nvn + ( ic21w + ic21 ) % ( nvcmn - 1 ) ] );
      dbl >>= ( skip );
      v1e <<= ( skip + 1 );
      ic21 ++;
      if( unsigned( v2 ) & thread_bit ) *thread_ep = v2;
      if( unsigned( v2 >> WS ) & thread_bit ) *( thread_ep + WS ) = v2;
      if( ( rem ^ unsigned( v2 ) ) & thread_bit ) {
        *thread_ep = u64( v1 ^ ( ( v1 & thread_bit ) ? 0
                                 : ( rem ^ unsigned( v2 ) ) ) );
        // __syncwarp(); // synced in the gather_double_cross below.
      }
      u64* vxbn = RUWS( nvx ) + vxb;
      int kvx = gather_double_cross( vxb, nvx, v2, v1, vxbn ); // 8 / 42
      int shift = ( dbl & 1 );
      dbl >>= 1;
      // avr( kvx ) : 8.6, 27% : > 10, 0.5% : > 20, 0.0005% : > 30
      while( kvx > MDIAG ) { // 14 / 42
        sub_count += MakeDiagL( vxbn, kvx ) << shift;
        vxbn ++;
        kvx --;
      }
      sub_count += MakeDiag( vxbn, kvx ) << shift;
/*
      if( kvx < 9 ) sub_count += MakeDiag( vxbn, kvx ) << shift;
      else          sub_count += MakeDiagC( vxbn, kvx ) << shift;
*/
      sm_count += ( 1 << shift );
      __syncwarp(); // !!
      // necessary to prevent shared data from being updated too early.
    }
  }
  return;
}

__device__ int MakeDiagL( const u64* __restrict__ const & vxb, const int & nvx )
{
  int diag_count = 0;
  u64 vdx = ( *vxb ) & ( ( 1ULL << 40 ) - 1 );
#pragma unroll 1
  for( int ic = 1; ic < nvx; ic += WS ) {
    if( ic + threadIdx.x < nvx ) {
      u64 vcx = vxb[ ic + threadIdx.x ];
#if N%2
      if( __popcll( vdx & vcx ) == 1 )
#else
      if( ( vdx & vcx ) == 0 )
#endif
      { // 4 - 10 / 42
        if( FinalCheck( *vxb, vcx ) ) diag_count ++;
      }
    }
  }
  return diag_count;
}

__device__ int MakeDiag( const u64* __restrict__ const & vx, const int & vxn )
{
  int diag_count = 0;
#pragma unroll 1
  for( int ivx = p_offset_c[ vxn ]; ivx < p_offset_c[ vxn + 1 ]; ivx += WS ) {
    if( ivx + threadIdx.x < p_offset_c[ vxn + 1 ] ) {
      unsigned int idx = p_idx_c[ ivx + threadIdx.x ];
      u64 vdx = vx[ idx >> 16 ];
      u64 vcx = vx[ idx & 0xffff ];
#if N%2
      if( __popcll( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 1 )
#else
      if( ( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 0 )
#endif
      { // 4-10 / 42
        if( FinalCheck( vdx, vcx ) ) diag_count ++;
      }
    }
  }
  return diag_count;
}

/*
__device__ int MakeDiagC( const u64* __restrict__ const & vx, const int & vxn )
{
  __shared__ unsigned int idx_s[ NW ][ 2*WS ];
#define IDX idx_s[ threadIdx.y ]
#define IDXMSK ( 2*WS - 1 )
  int np = 0;
  int ib = 0;
  int ie = 0;
  int diag_count = 0;
#pragma unroll 1
  for( int ivx = p_offset_c[ vxn ]; ivx < p_offset_c[ vxn + 1 ]; ivx += WS ) {
    bool p = false;
    unsigned int idx;
    if( ivx + threadIdx.x < p_offset_c[ vxn + 1 ] ) {
      idx = p_idx_c[ ivx + threadIdx.x ];
      u64 vdx = vx[ idx >> 16 ];
      u64 vcx = vx[ idx & 0xffff ];
#if N%2
      p = __popcll( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 1;
#else
      p = ( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 0;
#endif
    }
    unsigned ballot = __ballot_sync( AllLanes, p );
    int sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
    if( p ) IDX[ ( ie + sub_sum ) & IDXMSK ] = idx;
    int nt = __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
    ie += nt;
    np += nt;
    if( np >= WS ) {
      int idx = IDX[ ( ib + threadIdx.x ) & IDXMSK ];
      if( FinalCheck( vx[ idx >> 16 ], vx[ idx & 0xffff ] ) ) diag_count ++;
      np -= WS;
      ib += WS;
    }
  }
  if( threadIdx.x < np ) {
    int idx = IDX[ ( ib + threadIdx.x ) & IDXMSK ];
    if( FinalCheck( vx[ idx >> 16 ], vx[ idx & 0xffff ] ) ) diag_count ++;
  }
  return diag_count;
}
*/

__device__ bool FinalCheck( const u64 & vdx, const u64 & vcx )
{
  int* e2row = which_row[ threadIdx.y ] + NN - 64;
  u64* e2colv = which_colv[ threadIdx.y ];
  int i;
  i = e2row[ __clzll( vdx & e2colv[ ( vcx >> 40 ) & 0xff ] ) ];
  if( e2row[ __clzll( vcx & e2colv[ ( vdx >> 40 ) & 0xff ] ) ] != i )
    return false;
  i = 40 + ( ( ( i & 1 ) + 1 ) << 3 );
  return ( e2row[ __clzll( vdx & e2colv[ ( vcx >> i ) & 0xff ] ) ]
           == e2row[ __clzll( vcx & e2colv[ ( vdx >> i ) & 0xff ] ) ] );
}

#ifndef noMD5
void MakeMD5( u64 max_p, u64 second_p, u64 count, char md5str[] )
{
  MD5_CTX md5ctx;
  unsigned char md5bin[ MD5_DIGEST_LENGTH ];
  const char* sample = "e00000007 1c0000038 494199069345";
  int msg_len = strlen( sample );
  char msg[ msg_len + 1 ];
  sprintf( msg, "%09llx %09llx %15llu", max_p, second_p, count );
  MD5_Init( &md5ctx );
  MD5_Update( &md5ctx, msg, msg_len );
  MD5_Final( md5bin, &md5ctx );
  for( int i = 0; i < MD5_DIGEST_LENGTH; i ++ )
    sprintf( md5str + ( i * 2 ), "%02x", md5bin[ i ] );
  md5str[ MD5_DIGEST_LENGTH * 2 ] = 0;
}
#endif

int main( int argc, char* argv[] )
{
  fprintf( stderr, "%s\n", __FILE__ );
  fprintf( stderr, "Size of long long unsigned : %lu \n", sizeof( u64 ) );
  if( sizeof( u64 ) * 8 < NN ) {
    fprintf( stderr, "Size of long long is too small : %lu \n", sizeof( u64 ) );
    exit( 1 );
  }
  if( check_topbit( ) ) {
    fprintf( stderr, "topbit() doesn't work correctly on this platform.\n" );
    exit( 1 );
  }
  GenVecs( NN, ( ( NN + 1 ) * N ) / 2, N - 1, 0ULL );
  fprintf( stderr, "nvecs %d\n", nvecs );
  /* pair indexes ----------------------------------- */
  int p_offset[ MDIAG + 2 ];
  int p_idx[ MDPAIR ];
  int ioff = 0;
  for( int ndiag = 0; ndiag < MDIAG + 1; ndiag ++ ) {
    p_offset[ ndiag ] = ioff;
    for( int idx = 0; idx < ndiag - 1; idx ++ ) {
      for( int icx = idx + 1; icx < ndiag; icx ++ ) {
        p_idx[ ioff ++ ] = ( idx << 16 ) | icx;
      }
    }
  }
  p_offset[ MDIAG + 1 ] = ioff;
  fprintf( stderr, "size of p_idx_c %d \n", ioff );
  if( ioff > MDPAIR ) {
    fprintf( stderr, "p_idx_c index out of range. %d >= %d\n", ioff, MDPAIR );
    exit( 1 );
  }
  /* ------------------------------------------------ */
  // Get device properties
  // Set a part of constant memory
  int ndev;
  CU_SAFE( cudaGetDeviceCount( &ndev ) );
  vector<int> dev_blocks( ndev );
  totalwarps = 0;
  for( int idev = 0 ; idev < ndev; idev ++ ) {
    CU_SAFE( cudaSetDevice( idev ) );
    cudaDeviceProp prop;
    CU_SAFE( cudaGetDeviceProperties( &prop, idev ) );
    int nblocks = prop.multiProcessorCount * NB_SM;
    dev_blocks[ idev ] = nblocks;
    fprintf( stderr, "DevId:%2d #SM:%d, #Blocks:%d %s\n",
             idev, prop.multiProcessorCount, nblocks, prop.name );
    CU_SAFE( cudaMemcpyToSymbol( p_offset_c, p_offset,
                                 sizeof( int ) * ( MDIAG + 2 ) ) );
    CU_SAFE( cudaMemcpyToSymbol( p_idx_c, p_idx, sizeof( int ) * ioff ) );
    totalwarps += nblocks * NW;
  }
  fprintf( stderr, "Total warps: %5d\n", totalwarps );
  //
  u64 row1P = strtoll( argv[ 2 ], NULL, 16 );
  u64 row2P = strtoll( argv[ 4 ], NULL, 16 );
  fprintf( stderr, "r1:%09llx r2:%09llx\n", row1P, row2P );
  u64 total_count = 0ULL;
  u64 total_sm_count = 0ULL;
  for( int max_row = 0; max_row < nvecs; ++ max_row ) {
    u64 max_rowP = vecs[ max_row ];
    if( row1P != 0ULL and row1P < max_rowP ) continue;
    if( max_rowP < row1P ) break;
    if( ! ( max_rowP & 1ULL << ( NN - 1 ) ) ) break;
    if( max_rowP < Reverse( max_rowP ) ) continue;
    for( int second_row = max_row + 1; second_row < nvecs; ++ second_row ) {
      u64 second_rowP = vecs[ second_row ];
      if( row2P != 0ULL and row2P < second_rowP ) continue;
      if( second_rowP < row2P ) break;
      if( max_rowP & second_rowP ) continue;
      if( max_rowP < Reverse( second_rowP ) ) continue;
      u64 rem = AllBits ^ max_rowP ^ second_rowP;
      if( rem > second_rowP ) break;
      count_m = 0ULL;
      sm_count_m = 0ULL;
      Count( max_row, second_row, dev_blocks );
      printf( "%05d %09llx ", max_row, vecs[ max_row ] );
      printf( "%05d %09llx ", second_row, vecs[ second_row ] );
      printf( "%15llu ", count_m );
#ifndef noMD5
      char md5str[ MD5_DIGEST_LENGTH * 2 + 1 ];
      MakeMD5( vecs[ max_row ], vecs[ second_row ], count_m, md5str );
      printf( "%s", md5str );
#endif
      printf( "\n" );
      printf( "%05d %09llx ", max_row, vecs[ max_row ] );
      printf( "%05d %09llx ", second_row, vecs[ second_row ] );
      printf( "S %13llu ", sm_count_m );
#ifndef noMD5
      MakeMD5( vecs[ max_row ], vecs[ second_row ], sm_count_m, md5str );
      printf( "%s", md5str );
#endif
      printf( "\n" );
      total_count += count_m;
      total_sm_count += sm_count_m;
    }
  }
  if( row2P == 0ULL ) {
    fprintf( stderr, "Total: %17llu\n", total_count );
    fprintf( stderr, "Total_SM:%15llu\n", total_sm_count );
  }
  return 0;
}