/*
This file is part of mfaktc.
Copyright (C) 2012  George Woltman (woltman@alum.mit.edu)
	2012, 2014  Serge Batalov  (limb extensions)

mfaktc is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

mfaktc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with mfaktc.  If not, see <http://www.gnu.org/licenses/>.
*/


__device__ static void mul_256_F192_223_initial_special(int256 *res, int256 a, int256 b)
/* res = a * b (only the lower 256 bits of the result).
   Expects b.d0 to be one and b.d1-d5 to be zero, and a.d3-d7 to be zero
   (the callers in this file additionally pass a.d1 = a.d2 = 0).
   Only a.d0, a.d1, a.d2, b.d6 and b.d7 are actually read. */
{
  /* Limbs 0-2: b.d0 is one, so a.d0-d2 pass straight through. */
  res->d0 = a.d0;                             /* (a.d0 * b.d0).lo */
  res->d1 = a.d1;                             /* (a.d1 * b.d0).lo */
  res->d2 = a.d2;                             /* (a.d2 * b.d0).lo */

  /* Limbs 3-5: nothing contributes here under the stated preconditions. */
  res->d3 = 0;
  res->d4 = 0;
  res->d5 = 0;

  /* Limbs 6-7: the only real multiplications.  No carry out of limb 6 is
     needed because limb 6 receives a single partial product. */
  asm("{\n\t"
      "mul.lo.u32      %0, %2, %4;\n\t"       /* d6  = (a.d0 * b.d6).lo */
      "mul.lo.u32      %1, %2, %5;\n\t"       /* d7  = (a.d0 * b.d7).lo */

      "mad.hi.u32      %1, %2, %4, %1;\n\t"   /* d7 += (a.d0 * b.d6).hi */

      "mad.lo.u32      %1, %3, %4, %1;\n\t"   /* d7 += (a.d1 * b.d6).lo */
      "}"
      : "=r" (res->d6), "=r" (res->d7)
      : "r" (a.d0), "r" (a.d1), "r" (b.d6), "r" (b.d7));
}


__device__ static void mul_256_F192_223_special(int256 *res, int256 a, int256 b)
/* res = a * b (only the lower 256 bits of the result).
   Expects b.d0 to be one and b.d1-d5 to be zero; only b.d6 and b.d7 are read from b. */
{
  /* Limbs 0-5: b.d0 is one and b.d1-d5 are zero, so a.d0-d5 pass straight through. */
  res->d0 = a.d0;                                /* (a.d0 * b.d0).lo */
  res->d1 = a.d1;                                /* (a.d1 * b.d0).lo */
  res->d2 = a.d2;                                /* (a.d2 * b.d0).lo */
  res->d3 = a.d3;                                /* (a.d3 * b.d0).lo */
  res->d4 = a.d4;                                /* (a.d4 * b.d0).lo */
  res->d5 = a.d5;                                /* (a.d5 * b.d0).lo */

  /* Limbs 6-7.  The first two instructions form one carry chain and therefore
     have to stay together, in this order, inside a single asm statement. */
  asm("{\n\t"
      "mad.lo.cc.u32   %0, %2, %6, %4;\n\t"      /* d6  = (a.d0 * b.d6).lo + (a.d6 * b.d0).lo, carry out */
      "madc.lo.u32     %1, %2, %7, %5;\n\t"      /* d7  = (a.d0 * b.d7).lo + (a.d7 * b.d0).lo + carry in */

      "mad.hi.u32      %1, %2, %6, %1;\n\t"      /* d7 += (a.d0 * b.d6).hi */

      "mad.lo.u32      %1, %3, %6, %1;\n\t"      /* d7 += (a.d1 * b.d6).lo */
      "}"
      : "=r" (res->d6), "=r" (res->d7)
      : "r" (a.d0), "r" (a.d1), "r" (a.d6), "r" (a.d7), "r" (b.d6), "r" (b.d7));
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 192 <= N <= 223.  Works on f between 225 and 236 (235?!) bits inclusive
//
// Outline:
//   1. Every thread counts the set bits in its slice of bit_array; the block then builds an
//      inclusive prefix sum of those counts in bitcount[].
//   2. Every thread writes the bit positions of its set bits into smem[], at offsets derived from
//      the prefix sum, so smem[0 .. total_bit_count-1] holds all candidates of the block.
//   3. The threads of the block walk smem[] with a stride of THREADS_PER_BLOCK and run the
//      powering/reduction loop on each candidate f = k*2^exp + 1.
//
// Parameters:
//   exp             - the exponent N of f = k*2^N+1 (k is shifted left by exp - 192 into f.d6/f.d7)
//   k_base          - k value that belongs to bit 0 of block 0's section of bit_array
//   bit_array       - sieve bits, every set bit is a candidate to test
//   bits_to_process - number of sieve bits handled by one block
//   RES             - result buffer; RES[0] counts the hits, RES[1] is set to 8, hits are stored
//                     8 words each starting at RES[2], and thread 0 of block 0 writes a validation
//                     record at RES[RESULTS_ARRAY_VALIDATION_OFFSET]
//
// Launch requirements visible in this code: 256 threads per block (bitcount[256], the prefix
// sum and the "/ 8192" below depend on it) and dynamic shared memory with room for one
// unsigned short per set bit in the block's section of bit_array.
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MAX_BLOCKS) mfaktc_barrett236_F192_223gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES)
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int256 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.
  // 8192 = 256 threads * 32 bits per word.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // Every tally step reads counts that other threads wrote in the previous step.  In each step
  // the threads that write (bit set in threadIdx.x) only read entries of threads that do not
  // write in that step, so one barrier between steps is sufficient.
  // Do not rely on the threads of a warp running in lock-step for the first five steps: that is
  // not guaranteed on GPUs with independent thread scheduling.  __syncthreads() is used because
  // it is available on every toolkit; all barriers are outside the conditionals, so every thread
  // of the block reaches each of them.
  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// This thread's entries go to smem[total_bit_count - bitcount[threadIdx.x]] and upwards.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first), then clear it

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int256 a, u, tmp256;
    int480 b, tmp480;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = 0;
    f.d2 = 0;
    f.d3 = 0;
    f.d4 = 0;
    f.d5 = 0;
							// NOTE(review): for exp == 192 the right shift below is by 32 bits, which
							// C++ leaves undefined; presumably this relies on the GPU shift
							// instruction clamping the count and yielding 0 -- confirm.
    f.d6 = (k.d0 << (exp - 192));
    f.d7 = (k.d1 << (exp - 192)) + (k.d0 >> (32 - (exp - 192)));

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d7);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d6);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp480.d14 = 0xFFFFFFFF;				// tmp480 is nearly 2^480
    tmp480.d13= 0xFFFFFFFF; tmp480.d12= 0xFFFFFFFF; tmp480.d11= 0xFFFFFFFF; tmp480.d10= 0xFFFFFFFF;
    tmp480.d9 = 0xFFFFFFFF; tmp480.d8 = 0xFFFFFFFF; tmp480.d7 = 0xFFFFFFFF; tmp480.d6 = 0xFFFFFFFF; tmp480.d5 = 0xFFFFFFFF;
    tmp480.d4 = 0xFFFFFFFF; tmp480.d3 = 0xFFFFFFFF; tmp480.d2 = 0xFFFFFFFF; tmp480.d1 = 0xFFFFFFFF; tmp480.d0 = 0xFFFFFFFF;

    // Could write optimized div_480_256 with so many tmp480 elements known to be zero
    div_480_256(&u,tmp480,f,ff);			// u = floor(2^480 / f).  This requires f >= 225 bits.

							// b_preinit = 2^256
							// a = b_preinit / 2^224 = 2^32
							// tmp480 = a * u = (b_preinit / 2^224) * (2^480 / f)     (ignore the floor functions for now)
    a.d0 = u.d7;					// a = tmp480 / 2^256, which if we do the math simplifies to the quotient: b_preinit / f
    a.d1 = 0;
    a.d2 = 0;
    a.d3 = 0; a.d4 = 0; a.d5 = 0; a.d6 = 0; a.d7 = 0;	// a is passed by value below, do not hand over uninitialized limbs

    mul_256_F192_223_initial_special(&tmp256, a, f);	// tmp256 = quotient * f, we only compute the low 256-bits here

    a.d0 = __sub_cc (0, tmp256.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp256.d1);			// we do not need the upper digits of b_preinit and tmp256 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp256.d2);
    a.d3 = __subc_cc(0, tmp256.d3);
    a.d4 = __subc_cc(0, tmp256.d4);
    a.d5 = __subc_cc(0, tmp256.d5);
    a.d6 = __subc_cc(0, tmp256.d6);
    a.d7 = __subc   (0, tmp256.d7);

    for (shifter = 0; shifter < exp - 2 - 8; shifter++)
    {
							// On input a is at most 239.999 bits (see end of this loop)

      square_256_480(&b, a);				// b = a^2, b is at most 480.0 bits

      tmp256.d0 = b.d7;					// a = b / 2^224, a is at most 255.999 bits
      tmp256.d1 = b.d8;
      tmp256.d2 = b.d9;
      tmp256.d3 = b.d10;
      tmp256.d4 = b.d11;
      tmp256.d5 = b.d12;
      tmp256.d6 = b.d13;
      tmp256.d7 = b.d14;

      mul_256_512_no_low8(&a, tmp256, u);		// a = (b / 2^224) * (2^480 / f) / 2^256    (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 16.  A full mul_256_512 would add 14 partial results
							// into tmp512.d7 which could have generated 14 carries into tmp512.d7.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d7 could have been added into
							// tmp512.d7 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d7 could have been added into tmp512.d7 possibly generating a carry.
							// A grand total of up to 16 carries lost.

      mul_256_F192_223_special(&tmp256, a, f);		// tmp256 = quotient * f, we only compute the low 256-bits here

      a.d0 = __sub_cc (b.d0, tmp256.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp256.d1);		// we do not need the upper digits of b and tmp256 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp256.d2);
      a.d3 = __subc_cc(b.d3, tmp256.d3);
      a.d4 = __subc_cc(b.d4, tmp256.d4);
      a.d5 = __subc_cc(b.d5, tmp256.d5);
      a.d6 = __subc_cc(b.d6, tmp256.d6);
      a.d7 = __subc   (b.d7, tmp256.d7);
							// Since the quotient was up to 16 too small, the remainder has a maximum value of 15*f,
							// or 236 bits + log2 (15) bits, which is 239.999 bits.
							// NOTE(review): a quotient that is up to 16 too small leaves a remainder below 17*f,
							// not 15*f; 236 + log2 (17) is about 240.09 bits.  This may be what the "(235?!)"
							// in the kernel header refers to -- confirm the supported bit range.
    }

    mod_simple_256(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 17 times f.

#if 0
    if(cmp_ge_256(finalrem,f) && f.d7)
    {
      printf("EEEEEK, final finalrem is >= f\n");
    }
if (blockIdx.x == 0 && threadIdx.x == 4){
mul_256_512_no_low8(&tmp256,u,f);
printf ("    f: %08X%08X%08X%08X%08X%08X%08X%08X\r\n", f.d7, f.d6, f.d5, f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X %X %X %X\r\n", u.d7, u.d6, u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X %X %X %X\r\n", tmp256.d7, tmp256.d6, tmp256.d5, tmp256.d4, tmp256.d3, tmp256.d2, tmp256.d1, tmp256.d0);
printf ("  rem: %08X%08X%08X%08X%08X%08X%08X%08X\r\n", finalrem.d7, finalrem.d6, finalrem.d5, finalrem.d4, finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* a hit is a final remainder equal to f - 1 (first alternative) or equal to 1 (second alternative) */

    if(finalrem.d5 == 0 && finalrem.d4 == 0 && finalrem.d3 == 0 && finalrem.d2 == 0 && finalrem.d1 == 0)
    if((finalrem.d6 == f.d6 && finalrem.d0 == 0 && finalrem.d7 == f.d7) ||
       (finalrem.d6 == 0    && finalrem.d0 == 1 && finalrem.d7 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=8;
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*8 + 2]=1;
	RES[index*8 + 3]=0;
	RES[index*8 + 4]=0;
	RES[index*8 + 5]=0;
	RES[index*8 + 6]=0;
	RES[index*8 + 7]=0;
	RES[index*8 + 8]=f.d6;
	RES[index*8 + 9]=f.d7;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* f and finalrem hold this thread's last candidate; they are only meaningful when total_bit_count != 0,
   which is what the first word of the record reports */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 8);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=f.d6;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=f.d7;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9] =finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+11]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+12]=finalrem.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+13]=finalrem.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+14]=finalrem.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+15]=finalrem.d6;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+16]=finalrem.d7;
  }
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 192 <= N <= 223.  Works on f between 237 and 247 bits inclusive
//
// Outline:
//   1. Every thread counts the set bits in its slice of bit_array; the block then builds an
//      inclusive prefix sum of those counts in bitcount[].
//   2. Every thread writes the bit positions of its set bits into smem[], at offsets derived from
//      the prefix sum, so smem[0 .. total_bit_count-1] holds all candidates of the block.
//   3. The threads of the block walk smem[] with a stride of THREADS_PER_BLOCK and run the
//      powering/reduction loop on each candidate f = k*2^exp + 1.
//
// Parameters:
//   exp             - the exponent N of f = k*2^N+1 (k is shifted left by exp - 192 into f.d6/f.d7)
//   k_base          - k value that belongs to bit 0 of block 0's section of bit_array
//   bit_array       - sieve bits, every set bit is a candidate to test
//   bits_to_process - number of sieve bits handled by one block
//   RES             - result buffer; RES[0] counts the hits, RES[1] is set to 8, hits are stored
//                     8 words each starting at RES[2], and thread 0 of block 0 writes a validation
//                     record at RES[RESULTS_ARRAY_VALIDATION_OFFSET]
//   bits_max        - selects the shift counts (bits_max - 1) and (33 - bits_max) used below.
//                     NOTE(review): judging by the "2^(255 + bits_in_f)" setup of tmp512 this
//                     appears to be (bits in f) - 224 -- confirm with the caller.
//
// Launch requirements visible in this code: 256 threads per block (bitcount[256], the prefix
// sum and the "/ 8192" below depend on it) and dynamic shared memory with room for one
// unsigned short per set bit in the block's section of bit_array.
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MAX_BLOCKS) mfaktc_barrett247_F192_223gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES, unsigned int bits_max)
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int256 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.
  // 8192 = 256 threads * 32 bits per word.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // Every tally step reads counts that other threads wrote in the previous step.  In each step
  // the threads that write (bit set in threadIdx.x) only read entries of threads that do not
  // write in that step, so one barrier between steps is sufficient.
  // Do not rely on the threads of a warp running in lock-step for the first five steps: that is
  // not guaranteed on GPUs with independent thread scheduling.  __syncthreads() is used because
  // it is available on every toolkit; all barriers are outside the conditionals, so every thread
  // of the block reaches each of them.
  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// This thread's entries go to smem[total_bit_count - bitcount[threadIdx.x]] and upwards.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first), then clear it

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int256 a, u, tmp256;
    int512 b, tmp512;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = 0;
    f.d2 = 0;
    f.d3 = 0;
    f.d4 = 0;
    f.d5 = 0;
							// NOTE(review): for exp == 192 the right shift below is by 32 bits, which
							// C++ leaves undefined; presumably this relies on the GPU shift
							// instruction clamping the count and yielding 0 -- confirm.
    f.d6 = (k.d0 << (exp - 192));
    f.d7 = (k.d1 << (exp - 192)) + (k.d0 >> (32 - (exp - 192)));

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d7);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d6);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp512.d15 = 1 << (bits_max - 1);			// tmp512 = 2^(255 + bits_in_f)
    tmp512.d14 = 0; tmp512.d13 = 0; tmp512.d12 = 0; tmp512.d11 = 0;
    tmp512.d10 = 0; tmp512.d9 = 0; tmp512.d8 = 0; tmp512.d7 = 0; tmp512.d6 = 0;
    tmp512.d5 = 0; tmp512.d4 = 0; tmp512.d3 = 0; tmp512.d2 = 0; tmp512.d1 = 0; tmp512.d0 = 0;

    // Could write optimized div_512_256 with so many tmp512 elements known to be zero
    div_512_256(&u,tmp512,f,ff);			// u = floor(2^(255 + bits_in_f) / f), giving 256 bits of precision, requires f >= 225 bits

							// b_preinit = 2^256
							// a = b_preinit / 2 ^ (bits_in_f - 1)
							// tmp512 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (255 + bits_in_f) / f)     (ignore the floor functions for now)
							// a = tmp512 / 2^256, which if we do the math simplifies to the quotient: b_preinit / f
    a.d0 = (u.d7 >> (bits_max - 1));
    a.d1 = 0;
    a.d2 = 0;
    a.d3 = 0; a.d4 = 0; a.d5 = 0; a.d6 = 0; a.d7 = 0;	// a is passed by value below, do not hand over uninitialized limbs

    mul_256_F192_223_initial_special(&tmp256, a, f);	// tmp256 = quotient * f, we only compute the low 256-bits here

    a.d0 = __sub_cc (0, tmp256.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp256.d1);			// we do not need the upper digits of b_preinit and tmp256 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp256.d2);
    a.d3 = __subc_cc(0, tmp256.d3);
    a.d4 = __subc_cc(0, tmp256.d4);
    a.d5 = __subc_cc(0, tmp256.d5);
    a.d6 = __subc_cc(0, tmp256.d6);
    a.d7 = __subc   (0, tmp256.d7);

    for (shifter = 0; shifter < exp - 2 - 8; shifter++)
    {
							// On input a is at most 250.700 bits (see end of this loop)

      square_256_512(&b, a);				// b = a^2, b is at most 501.528 bits

      tmp256.d0 = (b.d7  >> (bits_max - 1)) + (b.d8  << (33 - bits_max)); // a = b / (2 ^ (bits_in_f - 1)), a is at most 255.528 bits
      tmp256.d1 = (b.d8  >> (bits_max - 1)) + (b.d9  << (33 - bits_max));
      tmp256.d2 = (b.d9  >> (bits_max - 1)) + (b.d10 << (33 - bits_max));
      tmp256.d3 = (b.d10 >> (bits_max - 1)) + (b.d11 << (33 - bits_max));
      tmp256.d4 = (b.d11 >> (bits_max - 1)) + (b.d12 << (33 - bits_max));
      tmp256.d5 = (b.d12 >> (bits_max - 1)) + (b.d13 << (33 - bits_max));
      tmp256.d6 = (b.d13 >> (bits_max - 1)) + (b.d14 << (33 - bits_max));
      tmp256.d7 = (b.d14 >> (bits_max - 1)) + (b.d15 << (33 - bits_max));

      mul_256_512_no_low8(&a, tmp256, u);		// a = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (255 + bits_in_f) / f) / 2^256   (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 16.  A full mul_256_512 would add 15 partial results
							// into tmp512.d7 which could have generated 14 carries into tmp512.d7.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d7 could have been added into
							// tmp512.d7 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d7 could have been added into tmp512.d7 possibly generating a carry.
							// A grand total of up to 16 carries lost.

      mul_256_F192_223_special(&tmp256, a, f);		// tmp256 = quotient * f, we only compute the low 256-bits here

      a.d0 = __sub_cc (b.d0, tmp256.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp256.d1);		// we do not need the upper digits of b and tmp256 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp256.d2);
      a.d3 = __subc_cc(b.d3, tmp256.d3);
      a.d4 = __subc_cc(b.d4, tmp256.d4);
      a.d5 = __subc_cc(b.d5, tmp256.d5);
      a.d6 = __subc_cc(b.d6, tmp256.d6);
      a.d7 = __subc   (b.d7, tmp256.d7);
							// Since the quotient was up to 16 too small, the remainder has a maximum value of 17*f,
							// or 247 bits + log2 (17) bits, which is 250.700 bits.  In theory, this kernel can handle
							// f values up to 2^247.028.
							// NOTE(review): 247 + log2 (17) is about 251.09, not 250.700; the stated figure
							// matches log2 (13), the multiple named at the mod_simple_256 call -- confirm the bound.
    }

    mod_simple_256(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 13 times f.
							// NOTE(review): the loop comment above derives a bound of 17*f, not 13*f -- confirm which one mod_simple_256 must cover.

#if 0
    if(cmp_ge_256(finalrem,f))
    {
      printf("EEEEEK, final finalrem is >= f\n");
    }
if (blockIdx.x == 0 && threadIdx.x == 4){
mul_256_512_no_low8(&tmp256,u,f);
printf ("    f: %08X%08X%08X%08X%08X%08X%08X%08X\r\n", f.d7, f.d6, f.d5, f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X %X %X %X\r\n", u.d7, u.d6, u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X %X %X %X\r\n", tmp256.d7, tmp256.d6, tmp256.d5, tmp256.d4, tmp256.d3, tmp256.d2, tmp256.d1, tmp256.d0);
printf ("  rem: %08X%08X%08X%08X%08X%08X%08X%08X\r\n", finalrem.d7, finalrem.d6, finalrem.d5, finalrem.d4, finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* a hit is a final remainder equal to f - 1 (first alternative) or equal to 1 (second alternative) */

    if(finalrem.d5 == 0 && finalrem.d4 == 0 && finalrem.d3 == 0 && finalrem.d2 == 0 && finalrem.d1 == 0)
    if((finalrem.d6 == f.d6 && finalrem.d0 == 0 && finalrem.d7 == f.d7) ||
       (finalrem.d6 == 0    && finalrem.d0 == 1 && finalrem.d7 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=8;
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*8 + 2]=1;
	RES[index*8 + 3]=0;
	RES[index*8 + 4]=0;
	RES[index*8 + 5]=0;
	RES[index*8 + 6]=0;
	RES[index*8 + 7]=0;
	RES[index*8 + 8]=f.d6;
	RES[index*8 + 9]=f.d7;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* f and finalrem hold this thread's last candidate; they are only meaningful when total_bit_count != 0,
   which is what the first word of the record reports */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 8);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=f.d6;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=f.d7;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9] =finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+11]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+12]=finalrem.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+13]=finalrem.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+14]=finalrem.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+15]=finalrem.d6;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+16]=finalrem.d7;
  }
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 192 <= N <= 223.  Works on f between 248 and 249 bits inclusive
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MAX_BLOCKS) mfaktc_barrett249_F192_223gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES, unsigned int bits_max)
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int256 f, a;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // First five tallies remain within one warp.  Should be in lock-step.
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int256 u, tmp256;
    int512 b, tmp512;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = 0;
    f.d2 = 0;
    f.d3 = 0;
    f.d4 = 0;
    f.d5 = 0;
    f.d6 = (k.d0 << (exp - 192));
    f.d7 = (k.d1 << (exp - 192)) + (k.d0 >> (32 - (exp - 192)));

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d7);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d6);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp512.d15 = 1 << (bits_max - 1);			// tmp512 = 2^(255 + bits_in_f)
    tmp512.d14 = 0; tmp512.d13= 0; tmp512.d12= 0; tmp512.d11= 0;
    tmp512.d10 = 0; tmp512.d9 = 0; tmp512.d8 = 0; tmp512.d7 = 0; tmp512.d6 = 0;
    tmp512.d5 = 0; tmp512.d4 = 0; tmp512.d3 = 0; tmp512.d2 = 0; tmp512.d1 = 0; tmp512.d0 = 0;

    // Could write optimized div_512_256 with so many tmp512 elements known to be zero
    div_512_256(&u,tmp512,f,ff);			// u = floor(2^(255 + bits_in_f) / f), giving 256 bits of precision, requires f >= 225 bits

							// b_preinit = 2^256
							// a = b_preinit / 2 ^ (bits_in_f - 1)
							// tmp512 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (255 + bits_in_f) / f)     (ignore the floor functions for now)
							// a = tmp512 / 2^256, which if we do the math simplifies to the quotient: b_preinit / f
    a.d0 = (u.d7 >> (bits_max - 1));
    a.d1 = 0;
    a.d2 = 0;

    mul_256_F192_223_initial_special(&tmp256, a, f);	// tmp256 = quotient * f, we only compute the low 256-bits here

    a.d0 = __sub_cc (0, tmp256.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp256.d1);			// we do not need the upper digits of b_preinit and tmp256 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp256.d2);
    a.d3 = __subc_cc(0, tmp256.d3);
    a.d4 = __subc_cc(0, tmp256.d4);
    a.d5 = __subc_cc(0, tmp256.d5);
    a.d6 = __subc_cc(0, tmp256.d6);
    a.d7 = __subc   (0, tmp256.d7);

    for (shifter = 0; shifter < exp - 2 - 8; shifter++)
    {
							// On input a is at most 249.858 bits (see end of this loop)

      square_256_512(&b, a);				// b = a^2, b is at most 499.716 bits

      a.d0 = (b.d7  >> (bits_max - 5)) + (b.d8  << (32 - (bits_max - 5))); // a = b / (2 ^ (bits_in_f - 5)), a is at most 255.716 bits
      a.d1 = (b.d8  >> (bits_max - 5)) + (b.d9  << (32 - (bits_max - 5)));
      a.d2 = (b.d9  >> (bits_max - 5)) + (b.d10 << (32 - (bits_max - 5)));
      a.d3 = (b.d10 >> (bits_max - 5)) + (b.d11 << (32 - (bits_max - 5)));
      a.d4 = (b.d11 >> (bits_max - 5)) + (b.d12 << (32 - (bits_max - 5)));
      a.d5 = (b.d12 >> (bits_max - 5)) + (b.d13 << (32 - (bits_max - 5)));
      a.d6 = (b.d13 >> (bits_max - 5)) + (b.d14 << (32 - (bits_max - 5)));
      a.d7 = (b.d14 >> (bits_max - 5)) + (b.d15 << (32 - (bits_max - 5)));

      mul_256_512_no_low8(&tmp256, a, u);		// tmp256 = (b / 2 ^ (bits_in_f - 5)) * (2 ^ (255 + bits_in_f) / f) / 2^256    (ignore the floor functions for now)

      a.d0 = (tmp256.d0 >> 4) + (tmp256.d1 << 28);	// a = tmp256 / 2^4, which if we do the math simplifies to the quotient: b / f
      a.d1 = (tmp256.d1 >> 4) + (tmp256.d2 << 28);
      a.d2 = (tmp256.d2 >> 4) + (tmp256.d3 << 28);
      a.d3 = (tmp256.d3 >> 4) + (tmp256.d4 << 28);
      a.d4 = (tmp256.d4 >> 4) + (tmp256.d5 << 28);
      a.d5 = (tmp256.d5 >> 4) + (tmp256.d6 << 28);
      a.d6 = (tmp256.d6 >> 4) + (tmp256.d7 << 28);
      a.d7 = (tmp256.d7 >> 4);
							// The quotient is off by at most 16/16.  A full mul_256_512 would add 15 partial results
							// into tmp512.d7 which could have generated 14 carries into tmp512.d7.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d7 could have been added into
							// tmp512.d7 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d7 could have been added into tmp512.d7 possibly generating a carry.
							// A grand total of up to 16 carries lost.  However, we calculated an extra 4 bits of
							// precision in mul_256_512_no_low8 the shifted quotient is off by at most 16/16.

      mul_256_F192_223_special(&tmp256, a, f);		// tmp256 = quotient * f, we only compute the low 256-bits here

      a.d0 = __sub_cc (b.d0, tmp256.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp256.d1);		// we do not need the upper digits of b and tmp256 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp256.d2);
      a.d3 = __subc_cc(b.d3, tmp256.d3);
      a.d4 = __subc_cc(b.d4, tmp256.d4);
      a.d5 = __subc_cc(b.d5, tmp256.d5);
      a.d6 = __subc_cc(b.d6, tmp256.d6);
      a.d7 = __subc   (b.d7, tmp256.d7);
							// Since the quotient was up to 16/16 too small, the remainder has a maximum value of (1+16/16)*f,
							// or 249 bits + log2 (1+16/16) bits, which is 249.807 bits.  In theory, this kernel can handle
							// f values up to 2^249.257.
    }

    if(cmp_ge_256(a,f))					// final adjustment in case a >= f
    {
      sub_256(&a, a, f);
    }

#if 0
    if(cmp_ge_256(a,f))
    {
      printf("EEEEEK, final a is >= f\n");
    }
if (blockIdx.x == 0 && threadIdx.x == 4){
mul_256_512_no_low8(&tmp256,u,f);
printf ("    f: %08X%08X%08X%08X%08X%08X%08X%08X\r\n", f.d7, f.d6, f.d5, f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X %X %X %X\r\n", u.d7, u.d6, u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X %X %X %X\r\n", tmp256.d7, tmp256.d6, tmp256.d5, tmp256.d4, tmp256.d3, tmp256.d2, tmp256.d1, tmp256.d0);
printf ("  rem: %08X%08X%08X%08X%08X%08X%08X%08X\r\n", finalrem.d7, finalrem.d6, finalrem.d5, finalrem.d4, finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */

    if(a.d5 == 0 && a.d4 == 0 && a.d3 == 0 && a.d2 == 0 && a.d1 == 0)
    if((a.d6 == f.d6 && a.d0 == 0 && a.d7 == f.d7) ||
       (a.d6 == 0    && a.d0 == 1 && a.d7 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=8;
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*8 + 2]=1;
	RES[index*8 + 3]=0;
	RES[index*8 + 4]=0;
	RES[index*8 + 5]=0;
	RES[index*8 + 6]=0;
	RES[index*8 + 7]=0;
	RES[index*8 + 8]=f.d6;
	RES[index*8 + 9]=f.d7;
      }
    }
  }

/* finally write occasional result for validation by C code */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 8);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=f.d6;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=f.d7;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9] =a.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=a.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+11]=a.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+12]=a.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+13]=a.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+14]=a.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+15]=a.d6;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+16]=a.d7;
  }
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 192 <= N <= 223.  Works on f between 250 and 252 bits inclusive
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MAX_BLOCKS) mfaktc_barrett252_F192_223gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES, unsigned int bits_max)
/*
GPU-sieve trial factoring kernel for candidates f = k * 2^exp + 1.

Each block owns bits_to_process bits of bit_array.  The threads of the block
first compact the set bits of that section into a list of 16-bit k offsets in
shared memory.  Then every thread takes each THREADS_PER_BLOCK-th entry of that
list, builds f, starts from the remainder of 2^256 divided by f and squares it
(exp - 10) times using Barrett reduction.  f is written to RES[] when the final
residue equals f - 1 or 1.

exp              exponent of the candidates; the shifts below use (exp - 192)
k_base           k that belongs to bit 0 of block 0; sieve bit n maps to
                 k_base + n * NUM_CLASSES.  Only d0 and d1 are used.
bit_array        sieve output, one bit per candidate
bits_to_process  number of sieve bits per block; every thread scans
                 bits_to_process / 8192 consecutive 32-bit words
RES              result array.  RES[0] is an atomic factor counter, RES[1] is
                 set to 8, factor number index is stored in the 8 words starting at
                 RES[index*8 + 2].  A validation record is written at
                 RES[RESULTS_ARRAY_VALIDATION_OFFSET] by thread 0 of block 0.
bits_max         shift count selector; per the in-line comments (bits_max - 1)
                 is the bit position inside the top 32-bit word, i.e.
                 bits_max = bits_in_f - 224 -- NOTE(review): confirm with the
                 host-side launcher.

Launch requirements: the prefix sum below is hard wired for 256 threads per
block, and the dynamic shared memory (smem[]) must hold one unsigned short per
set bit of the block's sieve section.
*/
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int256 f, a;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block
//
// Every tally step reads a slot that another thread wrote in the previous step
// (or in the popcount loop above).  Older GPUs executed a warp in lock-step, so
// the first five steps used to run without any barrier.  GPUs with independent
// thread scheduling do not guarantee that ordering, therefore every step is now
// separated by a barrier.  Within one step the threads that write (bit set) and
// the slots that are read (bit clear) never coincide, so one barrier between
// steps is sufficient.  All barriers are outside the conditionals and are
// reached by every thread of the block.

  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// The thread's output slots start at (total - inclusive prefix count), so the
// slot ranges of different threads do not overlap.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first) and clear it

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value of this block: k_base += blockIdx.x * bits_to_process * NUM_CLASSES
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int256 u, tmp256;
    int512 b, tmp512;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = 0;
    f.d2 = 0;
    f.d3 = 0;
    f.d4 = 0;
    f.d5 = 0;
    f.d6 = (k.d0 << (exp - 192));
    f.d7 = (k.d1 << (exp - 192)) + (k.d0 >> (32 - (exp - 192)));
							// NOTE(review): for exp == 192 the right shift above is a shift by 32,
							// which C++ leaves undefined -- confirm the generated code yields 0 there.

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d7);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d6);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp512.d15 = 1 << (bits_max - 1);			// tmp512 = 2^(255 + bits_in_f)
    tmp512.d14 = 0; tmp512.d13= 0; tmp512.d12= 0; tmp512.d11= 0;
    tmp512.d10 = 0; tmp512.d9 = 0; tmp512.d8 = 0; tmp512.d7 = 0; tmp512.d6 = 0;
    tmp512.d5 = 0; tmp512.d4 = 0; tmp512.d3 = 0; tmp512.d2 = 0; tmp512.d1 = 0; tmp512.d0 = 0;

    // Could write optimized div_512_256 with so many tmp512 elements known to be zero
    div_512_256(&u,tmp512,f,ff);			// u = floor(2^(255 + bits_in_f) / f), giving 256 bits of precision, requires f >= 225 bits

							// b_preinit = 2^256
							// a = b_preinit / 2 ^ (bits_in_f - 1)
							// tmp512 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (255 + bits_in_f) / f)     (ignore the floor functions for now)
							// a = tmp512 / 2^256, which if we do the math simplifies to the quotient: b_preinit / f
    a.d0 = (u.d7 >> (bits_max - 1));
    a.d1 = 0;
    a.d2 = 0;
    a.d3 = 0;						// The upper words are not read by the multiply below, but they are
    a.d4 = 0;						// passed to it by value; zero them so no indeterminate value is
    a.d5 = 0;						// handed over and the "upper words are zero" precondition of
    a.d6 = 0;						// mul_256_F192_223_initial_special really holds.
    a.d7 = 0;

    mul_256_F192_223_initial_special(&tmp256, a, f);	// tmp256 = quotient * f, we only compute the low 256-bits here

    a.d0 = __sub_cc (0, tmp256.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp256.d1);			// we do not need the upper digits of b_preinit and tmp256 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp256.d2);
    a.d3 = __subc_cc(0, tmp256.d3);
    a.d4 = __subc_cc(0, tmp256.d4);
    a.d5 = __subc_cc(0, tmp256.d5);
    a.d6 = __subc_cc(0, tmp256.d6);
    a.d7 = __subc   (0, tmp256.d7);

    // a starts as 2^256 = 2^(2^8) reduced by f; each pass below squares it once more.
    for (shifter = 0; shifter < exp - 2 - 8; shifter++)
    {
							// On input a is at most 252 bits (see end of this loop)

      square_256_512(&b, a);				// b = a^2, b is at most 504 bits

      tmp256.d0 = (b.d7  >> (bits_max - 1)) + (b.d8  << (33 - bits_max)); // a = b / (2 ^ (bits_in_f - 1)), a is at most 253 bits
      tmp256.d1 = (b.d8  >> (bits_max - 1)) + (b.d9  << (33 - bits_max));
      tmp256.d2 = (b.d9  >> (bits_max - 1)) + (b.d10 << (33 - bits_max));
      tmp256.d3 = (b.d10 >> (bits_max - 1)) + (b.d11 << (33 - bits_max));
      tmp256.d4 = (b.d11 >> (bits_max - 1)) + (b.d12 << (33 - bits_max));
      tmp256.d5 = (b.d12 >> (bits_max - 1)) + (b.d13 << (33 - bits_max));
      tmp256.d6 = (b.d13 >> (bits_max - 1)) + (b.d14 << (33 - bits_max));
      tmp256.d7 = (b.d14 >> (bits_max - 1)) + (b.d15 << (33 - bits_max));

      mul_256_512_no_low8(&a, tmp256, u);		// a = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (255 + bits_in_f) / f) / 2^256   (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 16.  A full mul_256_512 would add 15 partial results
							// into tmp512.d7 which could have generated 14 carries into tmp512.d7.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d7 could have been added into
							// tmp512.d7 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d7 could have been added into tmp512.d7 possibly generating a carry.
							// A grand total of up to 16 carries lost.

      mul_256_F192_223_special(&tmp256, a, f);		// tmp256 = quotient * f, we only compute the low 256-bits here

      tmp256.d0 = __sub_cc (b.d0, tmp256.d0);		// Compute the remainder
      tmp256.d1 = __subc_cc(b.d1, tmp256.d1);		// we do not need the upper digits of b and tmp256 because the result is 0 after subtraction!
      tmp256.d2 = __subc_cc(b.d2, tmp256.d2);
      tmp256.d3 = __subc_cc(b.d3, tmp256.d3);
      tmp256.d4 = __subc_cc(b.d4, tmp256.d4);
      tmp256.d5 = __subc_cc(b.d5, tmp256.d5);
      tmp256.d6 = __subc_cc(b.d6, tmp256.d6);
      tmp256.d7 = __subc   (b.d7, tmp256.d7);
							// Since the quotient was up to 16 too small, the remainder has a maximum value of 17*f,
							// or 252 bits + log2 (17) bits, which is 255.700 bits.

// Optimization:  Don't do the compare at the end of mod_simple_256.  Find a completely faster way to do this!
      mod_simple_256(&a, tmp256, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 12 times f.
						        // The adjustment should be perfect, so we now have 252 bits.
							// NOTE(review): "12 times f" disagrees with the 17*f bound derived just above;
							// one of the two comments is stale -- confirm against mod_simple_256.
    }

    if(cmp_ge_256(a,f))					// final adjustment in case a >= f
    {
      sub_256(&a, a, f);
    }

#if 0
    if(cmp_ge_256(a,f))
    {
      printf("EEEEEK, final a is >= f\n");
    }
if (blockIdx.x == 0 && threadIdx.x == 4){
mul_256_512_no_low8(&tmp256,u,f);
printf ("    f: %08X%08X%08X%08X%08X%08X%08X%08X\r\n", f.d7, f.d6, f.d5, f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X %X %X %X\r\n", u.d7, u.d6, u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X %X %X %X\r\n", tmp256.d7, tmp256.d6, tmp256.d5, tmp256.d4, tmp256.d3, tmp256.d2, tmp256.d1, tmp256.d0);
printf ("  rem: %08X%08X%08X%08X%08X%08X%08X%08X\r\n", a.d7, a.d6, a.d5, a.d4, a.d3, a.d2, a.d1, a.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* f.d0 is 1 and f.d1-d5 are 0, so the first alternative below is a == f - 1, the second one is a == 1 */

    if(a.d5 == 0 && a.d4 == 0 && a.d3 == 0 && a.d2 == 0 && a.d1 == 0)
    if((a.d6 == f.d6 && a.d0 == 0 && a.d7 == f.d7) ||
       (a.d6 == 0    && a.d0 == 1 && a.d7 == 0))
    {
      int index=atomicInc(&RES[0],10000);	/* old counter value is our slot number */
      RES[1]=8;					/* presumably the number of 32-bit words per stored factor -- confirm with the host code */
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*8 + 2]=1;			/* f.d0 .. f.d5 are the constants 1,0,0,0,0,0 */
	RES[index*8 + 3]=0;
	RES[index*8 + 4]=0;
	RES[index*8 + 5]=0;
	RES[index*8 + 6]=0;
	RES[index*8 + 7]=0;
	RES[index*8 + 8]=f.d6;
	RES[index*8 + 9]=f.d7;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* f and a hold the last candidate tested by thread 0.  If the block had no candidate at all they were never
   assigned; the first word of the record is 0 in that case. */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 8);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=f.d6;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=f.d7;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9] =a.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=a.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+11]=a.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+12]=a.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+13]=a.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+14]=a.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+15]=a.d6;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+16]=a.d7;
  }
}
