/*
This file is part of mfaktc.
Copyright (C) 2012  George Woltman (woltman@alum.mit.edu)

mfaktc is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

mfaktc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with mfaktc.  If not, see <http://www.gnu.org/licenses/>.
*/


__device__ static void mul_192_F128_159_initial_special(int192 *res, int192 a, int192 b)
/* res = a * b (only lower 192 bits of the result), a.d3, a.d4, a.d5 are zero, b.d1, b.d2, b.d3 are zero, b.d0 is one

   The stated zero/one values are assumptions, not checks: a.d3-a.d5 and b.d0-b.d3 are bound as
   operands but never referenced by the PTX below.

   PTX operand map: %0-%5 = res->d0..d5, %6-%11 = a.d0..d5, %12-%17 = b.d0..d5
   (so %16 = b.d4 and %17 = b.d5).

   With those assumptions the product reduces to
     res.d0..d2 = a.d0..d2
     res.d3     = 0
     res.d4     = (a.d0 * b.d4).lo
     res.d5     = (a.d0 * b.d5).lo + (a.d0 * b.d4).hi + (a.d1 * b.d4).lo
   No carry handling is needed: res.d4 is a single term and anything carried out of res.d5 lies
   above bit 191 and is discarded. */
{
  asm("{\n\t"
      "mov.u32         %0, %6;\n\t"          /* res.d0 = a.d0          = (a.d0 * b.d0).lo with b.d0 == 1 */
      "mov.u32         %1, %7;\n\t"          /* res.d1 = a.d1          = (a.d1 * b.d0).lo with b.d0 == 1 */
      "mov.u32         %2, %8;\n\t"          /* res.d2 = a.d2          = (a.d2 * b.d0).lo with b.d0 == 1 */
      "mov.u32         %3, 0;\n\t"           /* res.d3 = 0             (a.d3 assumed zero) */
      "mul.lo.u32      %4, %6, %16;\n\t"     /* res.d4 = (a.d0 * b.d4).lo */
      "mul.lo.u32      %5, %6, %17;\n\t"     /* res.d5 = (a.d0 * b.d5).lo */

      "mad.hi.u32      %5, %6, %16, %5;\n\t" /* res.d5 += (a.d0 * b.d4).hi */

      "mad.lo.u32      %5, %7, %16, %5;\n\t" /* res.d5 += (a.d1 * b.d4).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3), "r" (b.d4), "r" (b.d5));
}


__device__ static void mul_192_F128_159_special(int192 *res, int192 a, int192 b)
/* res = a * b (only lower 192 bits of the result), b.d1, b.d2, b.d3 are zero, b.d0 is one

   The values of b.d0-b.d3 are assumptions, not checks: they are bound as operands but never
   referenced by the PTX below.  All six words of a are used.

   PTX operand map: %0-%5 = res->d0..d5, %6-%11 = a.d0..d5, %12-%17 = b.d0..d5
   (so %16 = b.d4 and %17 = b.d5).

   With those assumptions the product reduces to
     res.d0..d3 = a.d0..d3
     res.d4     = a.d4 + (a.d0 * b.d4).lo
     res.d5     = a.d5 + (a.d0 * b.d5).lo + (a.d0 * b.d4).hi + (a.d1 * b.d4).lo + carry out of res.d4
   Carries out of res.d5 lie above bit 191 and are discarded. */
{
  asm("{\n\t"
      "mov.u32         %0,          %6;\n\t"  /* res.d0 = a.d0 */
      "mov.u32         %1,          %7;\n\t"  /* res.d1 = a.d1 */
      "mov.u32         %2,          %8;\n\t"  /* res.d2 = a.d2 */
      "mov.u32         %3,          %9;\n\t"  /* res.d3 = a.d3 */
      "mad.lo.cc.u32   %4, %6, %16, %10;\n\t" /* res.d4 = (a.d0 * b.d4).lo + a.d4, sets the carry flag */
      "madc.lo.u32     %5, %6, %17, %11;\n\t" /* res.d5 = (a.d0 * b.d5).lo + a.d5 + carry; must directly follow the line above */

      "mad.hi.u32      %5, %6, %16, %5;\n\t"  /* res.d5 += (a.d0 * b.d4).hi */

      "mad.lo.u32      %5, %7, %16, %5;\n\t"  /* res.d5 += (a.d1 * b.d4).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3), "r" (b.d4), "r" (b.d5));
}


__device__ static void mul_224_192_F128_159_initial_special16(int224 *res, int224 a, int192 b)
/* res = a * b (only lower 224 bits of the result), a.d3-d6 are zero, b.d1-d3 are zero, b.d0 is one

   The stated zero/one values are assumptions, not checks: a.d3-a.d6 and b.d0-b.d3 are bound as
   operands but never referenced by the PTX below.

   PTX operand map: %0-%6 = res->d0..d6, %7-%13 = a.d0..d6, %14-%19 = b.d0..d5
   (so %18 = b.d4 and %19 = b.d5).

   With those assumptions the product reduces to
     res.d0..d2 = a.d0..d2
     res.d3     = 0
     res.d4     = (a.d0 * b.d4).lo
     res.d5     = (a.d0 * b.d5).lo + (a.d0 * b.d4).hi + (a.d1 * b.d4).lo
     res.d6     = (a.d2 * b.d4).lo + (a.d0 * b.d5).hi + (a.d1 * b.d5).lo + (a.d1 * b.d4).hi + carries out of res.d5
   Carries out of res.d6 lie above bit 223 and are discarded. */
{
  asm("{\n\t"
      "mov.u32         %0,          %7;\n\t"  /* res.d0 = a.d0 */
      "mov.u32         %1,          %8;\n\t"  /* res.d1 = a.d1 */
      "mov.u32         %2,          %9;\n\t"  /* res.d2 = a.d2 */
      "mov.u32         %3,          0;\n\t"   /* res.d3 = 0 (a.d3 assumed zero) */
      "mul.lo.u32      %4, %7, %18;\n\t"      /* res.d4 = (a.d0 * b.d4).lo   (a.d4 assumed zero, nothing added) */
      "mul.lo.u32      %5, %7, %19;\n\t"      /* res.d5 = (a.d0 * b.d5).lo   (a.d5 assumed zero, nothing added) */
      "mul.lo.u32      %6, %9, %18;\n\t"      /* res.d6 = (a.d2 * b.d4).lo   (a.d6 assumed zero, nothing added) */

      "mad.hi.cc.u32   %5, %7, %18, %5;\n\t" /* res.d5 += (a.d0 * b.d4).hi, sets the carry flag */
      "madc.hi.u32     %6, %7, %19, %6;\n\t" /* res.d6 += (a.d0 * b.d5).hi + carry */

      "mad.lo.cc.u32   %5, %8, %18, %5;\n\t" /* res.d5 += (a.d1 * b.d4).lo, sets the carry flag */
      "madc.lo.u32     %6, %8, %19, %6;\n\t" /* res.d6 += (a.d1 * b.d5).lo + carry */

      "mad.hi.u32      %6, %8, %18, %6;\n\t" /* res.d6 += (a.d1 * b.d4).hi */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5), "=r" (res->d6)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5), "r" (a.d6),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3), "r" (b.d4), "r" (b.d5));
}


__device__ static void mul_224_192_F128_159_special(int224 *res, int224 a, int192 b)
/* res = a * b (only lower 224 bits of the result), b.d1-d3 are zero, b.d0 is one

   The values of b.d0-b.d3 are assumptions, not checks: they are bound as operands but never
   referenced by the PTX below.  All seven words of a are used.

   PTX operand map: %0-%6 = res->d0..d6, %7-%13 = a.d0..d6, %14-%19 = b.d0..d5
   (so %18 = b.d4 and %19 = b.d5).

   With those assumptions the product reduces to
     res.d0..d3 = a.d0..d3
     res.d4     = a.d4 + (a.d0 * b.d4).lo
     res.d5     = a.d5 + (a.d0 * b.d5).lo + (a.d0 * b.d4).hi + (a.d1 * b.d4).lo + carry out of res.d4
     res.d6     = a.d6 + (a.d2 * b.d4).lo + (a.d0 * b.d5).hi + (a.d1 * b.d5).lo + (a.d1 * b.d4).hi + carries out of res.d5
   Carries out of res.d6 lie above bit 223 and are discarded.  Each "cc"/"madc" pair below relies on
   the carry flag set by the instruction immediately before it, so the order must not be changed. */
{
  asm("{\n\t"
      "mov.u32         %0,          %7;\n\t"  /* res.d0 = a.d0 */
      "mov.u32         %1,          %8;\n\t"  /* res.d1 = a.d1 */
      "mov.u32         %2,          %9;\n\t"  /* res.d2 = a.d2 */
      "mov.u32         %3,          %10;\n\t" /* res.d3 = a.d3 */
      "mad.lo.cc.u32   %4, %7, %18, %11;\n\t" /* res.d4 = (a.d0 * b.d4).lo + a.d4, sets the carry flag */
      "madc.lo.cc.u32  %5, %7, %19, %12;\n\t" /* res.d5 = (a.d0 * b.d5).lo + a.d5 + carry, sets the carry flag */
      "madc.lo.u32     %6, %9, %18, %13;\n\t" /* res.d6 = (a.d2 * b.d4).lo + a.d6 + carry */

      "mad.hi.cc.u32   %5, %7, %18, %5;\n\t" /* res.d5 += (a.d0 * b.d4).hi, sets the carry flag */
      "madc.hi.u32     %6, %7, %19, %6;\n\t" /* res.d6 += (a.d0 * b.d5).hi + carry */

      "mad.lo.cc.u32   %5, %8, %18, %5;\n\t" /* res.d5 += (a.d1 * b.d4).lo, sets the carry flag */
      "madc.lo.u32     %6, %8, %19, %6;\n\t" /* res.d6 += (a.d1 * b.d5).lo + carry */

      "mad.hi.u32      %6, %8, %18, %6;\n\t" /* res.d6 += (a.d1 * b.d4).hi */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5), "=r" (res->d6)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5), "r" (a.d6),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3), "r" (b.d4), "r" (b.d5));
}


__device__ static void mul_224_F128_159_initial_special(int224 *res, int224 a, int224 b)
/* res = a * b (only lower 224 bits of the result), a.d3-d6 are zero, b.d1-d3 are zero, b.d0 is one

   The stated zero/one values are assumptions, not checks: a.d3-a.d6 and b.d0-b.d3 are bound as
   operands but never referenced by the PTX below.

   PTX operand map: %0-%6 = res->d0..d6, %7-%13 = a.d0..d6, %14-%20 = b.d0..d6
   (so %18 = b.d4, %19 = b.d5 and %20 = b.d6).

   With those assumptions the product reduces to
     res.d0..d2 = a.d0..d2
     res.d3     = 0
     res.d4     = (a.d0 * b.d4).lo
     res.d5     = (a.d0 * b.d5).lo + (a.d0 * b.d4).hi + (a.d1 * b.d4).lo
     res.d6     = (a.d0 * b.d6).lo + (a.d0 * b.d5).hi + (a.d1 * b.d5).lo + (a.d1 * b.d4).hi + (a.d2 * b.d4).lo
                  + carries out of res.d5
   Carries out of res.d6 lie above bit 223 and are discarded. */
{
  asm("{\n\t"
      "mov.u32         %0, %7;\n\t"          /* res.d0 = a.d0 */
      "mov.u32         %1, %8;\n\t"          /* res.d1 = a.d1 */
      "mov.u32         %2, %9;\n\t"          /* res.d2 = a.d2 */
      "mov.u32         %3,  0;\n\t"          /* res.d3 = 0 (a.d3 assumed zero) */
      "mul.lo.u32      %4, %7, %18;\n\t"     /* res.d4 = (a.d0 * b.d4).lo */
      "mul.lo.u32      %5, %7, %19;\n\t"     /* res.d5 = (a.d0 * b.d5).lo */
      "mul.lo.u32      %6, %7, %20;\n\t"     /* res.d6 = (a.d0 * b.d6).lo */

      "mad.hi.cc.u32   %5, %7, %18, %5;\n\t" /* res.d5 += (a.d0 * b.d4).hi, sets the carry flag */
      "madc.hi.u32     %6, %7, %19, %6;\n\t" /* res.d6 += (a.d0 * b.d5).hi + carry */

      "mad.lo.cc.u32   %5, %8, %18, %5;\n\t" /* res.d5 += (a.d1 * b.d4).lo, sets the carry flag */
      "madc.lo.u32     %6, %8, %19, %6;\n\t" /* res.d6 += (a.d1 * b.d5).lo + carry */

      "mad.hi.u32      %6, %8, %18, %6;\n\t" /* res.d6 += (a.d1 * b.d4).hi */

      "mad.lo.u32      %6, %9, %18, %6;\n\t" /* res.d6 += (a.d2 * b.d4).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5), "=r" (res->d6)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5), "r" (a.d6),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3), "r" (b.d4), "r" (b.d5), "r" (b.d6));
}


__device__ static void mul_224_F128_159_special(int224 *res, int224 a, int224 b)
/* res = a * b (only lower 224 bits of the result), b.d1-d3 are zero, b.d0 is one

   The values of b.d0-b.d3 are assumptions, not checks: they are bound as operands but never
   referenced by the PTX below.  All seven words of a are used.

   PTX operand map: %0-%6 = res->d0..d6, %7-%13 = a.d0..d6, %14-%20 = b.d0..d6
   (so %18 = b.d4, %19 = b.d5 and %20 = b.d6).

   With those assumptions the product reduces to
     res.d0..d3 = a.d0..d3
     res.d4     = a.d4 + (a.d0 * b.d4).lo
     res.d5     = a.d5 + (a.d0 * b.d5).lo + (a.d0 * b.d4).hi + (a.d1 * b.d4).lo + carry out of res.d4
     res.d6     = a.d6 + (a.d0 * b.d6).lo + (a.d0 * b.d5).hi + (a.d1 * b.d5).lo + (a.d1 * b.d4).hi
                  + (a.d2 * b.d4).lo + carries out of res.d5
   Carries out of res.d6 lie above bit 223 and are discarded.  Each "cc"/"madc" pair below relies on
   the carry flag set by the instruction immediately before it, so the order must not be changed. */
{
  asm("{\n\t"
      "mov.u32         %0,          %7;\n\t"  /* res.d0 = a.d0 */
      "mov.u32         %1,          %8;\n\t"  /* res.d1 = a.d1 */
      "mov.u32         %2,          %9;\n\t"  /* res.d2 = a.d2 */
      "mov.u32         %3,          %10;\n\t" /* res.d3 = a.d3 */
      "mad.lo.cc.u32   %4, %7, %18, %11;\n\t" /* res.d4 = (a.d0 * b.d4).lo + a.d4, sets the carry flag */
      "madc.lo.cc.u32  %5, %7, %19, %12;\n\t" /* res.d5 = (a.d0 * b.d5).lo + a.d5 + carry, sets the carry flag */
      "madc.lo.u32     %6, %7, %20, %13;\n\t" /* res.d6 = (a.d0 * b.d6).lo + a.d6 + carry */

      "mad.hi.cc.u32   %5, %7, %18, %5;\n\t" /* res.d5 += (a.d0 * b.d4).hi, sets the carry flag */
      "madc.hi.u32     %6, %7, %19, %6;\n\t" /* res.d6 += (a.d0 * b.d5).hi + carry */

      "mad.lo.cc.u32   %5, %8, %18, %5;\n\t" /* res.d5 += (a.d1 * b.d4).lo, sets the carry flag */
      "madc.lo.u32     %6, %8, %19, %6;\n\t" /* res.d6 += (a.d1 * b.d5).lo + carry */

      "mad.hi.u32      %6, %8, %18, %6;\n\t" /* res.d6 += (a.d1 * b.d4).hi */

      "mad.lo.u32      %6, %9, %18, %6;\n\t" /* res.d6 += (a.d2 * b.d4).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5), "=r" (res->d6)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5), "r" (a.d6),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3), "r" (b.d4), "r" (b.d5), "r" (b.d6));
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 128 <= N <= 159.  Works on f between 161 and 172 bits inclusive
//
// Phase 1: every thread scans its slice of this block's part of bit_array and the block compacts
//          the positions of all set bits into the dynamic shared array smem[].
// Phase 2: the threads walk smem[] with a stride of THREADS_PER_BLOCK; for each candidate
//          k = k_base + (bit position) * NUM_CLASSES they build f = k * 2^exp + 1, start from 2^256
//          and square (exp - 10) times modulo f using Barrett reduction.  A final remainder of
//          f - 1 or 1 is reported as a factor.
//
// Parameters:
//   exp             - the N in f = k*2^N+1 (128 <= N <= 159)
//   k_base          - k value belonging to bit 0 of bit_array; only d0 and d1 are used
//   bit_array       - sieve bits, one bit per candidate k, bits_to_process bits per block
//   bits_to_process - number of sieve bits handled by each thread block
//   RES             - RES[0] = factor counter (atomicInc), RES[1] = 6, up to 10 factors of 6 words
//                     each from RES[2] on; validation data at RESULTS_ARRAY_VALIDATION_OFFSET
//
// Launch requirements:
//   - exactly 256 threads per block (bitcount[256] and the prefix sum below are hard-wired to it)
//   - dynamic shared memory of at least (number of set bits in the block) unsigned shorts
//   - assumes bits_to_process is a non-zero multiple of 8192 and at most 65536, so that every
//     thread gets at least one whole word and bit positions fit an unsigned short -- TODO confirm
//     against the launching code
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MAX_BLOCKS) mfaktc_barrett172_F128_159gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES)
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int192 f, finalrem;
  const unsigned int f_shift = exp - 128;		// k is shifted left by this amount within f.d4/f.d5
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // f and finalrem are copied to RES[] by block 0 / thread 0 at the end of the kernel even when
  // that thread never tested a candidate.  Give them defined values for that case.
  f.d0 = 0; f.d1 = 0; f.d2 = 0; f.d3 = 0; f.d4 = 0; f.d5 = 0;
  finalrem.d0 = 0; finalrem.d1 = 0; finalrem.d2 = 0; finalrem.d3 = 0; finalrem.d4 = 0; finalrem.d5 = 0;

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;		// 8192 = 256 threads * 32 bits per word
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // Every tally step reads a count that another thread wrote in the previous step.  Threads of a
  // warp are not guaranteed to run in lock-step (independent thread scheduling on Volta and later),
  // so each step is separated by a barrier -- also the steps that stay within one warp.
  // Within a step no thread reads an entry that is written in that same step: writers have the
  // tested bit set in threadIdx.x, the entries they read have it clear.
  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// This thread owns the smem[] slots total_bit_count - bitcount[threadIdx.x] and up, one per set bit.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first) and clear it.
// The mask is built from an unsigned 1 so that clearing bit 31 does not overflow a signed int.

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1u << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value of this block.  The two statements form one add-with-carry chain, keep them adjacent.
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int192 a, u, tmp192;
    int352 b, tmp352;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = 0;
    f.d2 = 0;
    f.d3 = 0;
    f.d4 = (k.d0 << f_shift);
							// For exp == 128 nothing of k.d0 spills into f.d5.  The guard avoids
							// a shift by 32, which is undefined in C++.
    f.d5 = (k.d1 << f_shift) + (f_shift ? (k.d0 >> (32 - f_shift)) : 0);

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d5);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d4);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp352.d10 = 0xFFFFFFFF;				// tmp352 is nearly 2^352
    tmp352.d9 = 0xFFFFFFFF; tmp352.d8 = 0xFFFFFFFF; tmp352.d7 = 0xFFFFFFFF; tmp352.d6 = 0xFFFFFFFF; tmp352.d5 = 0xFFFFFFFF;
    tmp352.d4 = 0xFFFFFFFF; tmp352.d3 = 0xFFFFFFFF; tmp352.d2 = 0xFFFFFFFF; tmp352.d1 = 0xFFFFFFFF; tmp352.d0 = 0xFFFFFFFF;

    // Could write optimized div_352_192 with so many tmp352 elements known to be zero
    div_352_192(&u,tmp352,f,ff);			// u = floor(2^352 / f).  This requires f >= 161 bits.

							// b_preinit = 2^256
							// a = b_preinit / 2^160 = 2^96
							// tmp352 = a * u = (b_preinit / 2^160) * (2^352 / f)     (ignore the floor functions for now)
    a.d0 = u.d3;					// a = tmp352 / 2^192, which if we do the math simplifies to the quotient: b_preinit / f
    a.d1 = u.d4;
    a.d2 = u.d5;					// a.d3-a.d5 are left unset; the multiply below does not read them

    mul_192_F128_159_initial_special(&tmp192, a, f);	// tmp192 = quotient * f, we only compute the low 192-bits here

    a.d0 = __sub_cc (0, tmp192.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp192.d1);			// we do not need the upper digits of b_preinit and tmp192 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp192.d2);
    a.d3 = __subc_cc(0, tmp192.d3);
    a.d4 = __subc_cc(0, tmp192.d4);
    a.d5 = __subc   (0, tmp192.d5);

    // 2^256 = 2^(2^8); each pass squares once more, ending at 2^(2^(exp - 2)) mod f.
    for (shifter = 0; shifter < exp - 2 - 8; shifter++)
    {
							// On input a is at most 175.700 bits (see end of this loop)

      square_192_352(&b, a);				// b = a^2, b is at most 351.400 bits

      tmp192.d0 = b.d5;					// a = b / 2^160, a is at most 191.400 bits
      tmp192.d1 = b.d6;
      tmp192.d2 = b.d7;
      tmp192.d3 = b.d8;
      tmp192.d4 = b.d9;
      tmp192.d5 = b.d10;

      mul_192_384_no_low6(&a, tmp192, u);		// a = (b / 2^160) * (2^352 / f) / 2^192    (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 12.  A full mul_192_384 would add 11 partial results
							// into tmp384.d5 which could have generated 10 carries into tmp384.d6.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d5 could have been added into
							// tmp384.d5 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d5 could have been added into tmp384.d5 possibly generating a carry.
							// A grand total of up to 12 carries lost.

      mul_192_F128_159_special(&tmp192, a, f);		// tmp192 = quotient * f, we only compute the low 192-bits here

      a.d0 = __sub_cc (b.d0, tmp192.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp192.d1);		// we do not need the upper digits of b and tmp192 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp192.d2);
      a.d3 = __subc_cc(b.d3, tmp192.d3);
      a.d4 = __subc_cc(b.d4, tmp192.d4);
      a.d5 = __subc   (b.d5, tmp192.d5);
							// Since the quotient was up to 12 too small, the remainder has a maximum value of 13*f,
							// or 172 bits + log2 (13) bits, which is 175.700 bits.
    }

    mod_simple_192(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 13 times f.

#if 0
    if(cmp_ge_192(finalrem,f) && f.d5)
    {
      printf("EEEEEK, final finalrem is >= f\n");
    }
if (blockIdx.x == 0 && threadIdx.x == 4){
mul_192_384_no_low6(&tmp192,u,f);
printf ("    f: %08X%08X%08X%08X%08X%08X\r\n", f.d5, f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X %X\r\n", u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X %X\r\n", tmp192.d5, tmp192.d4, tmp192.d3, tmp192.d2, tmp192.d1, tmp192.d0);
printf ("  rem: %08X%08X%08X%08X%08X%08X\r\n", finalrem.d5, finalrem.d4, finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* a factor is reported when finalrem == f - 1 (first line of the test) or finalrem == 1 (second line) */

    if(finalrem.d3 == 0 && finalrem.d2 == 0 && finalrem.d1 == 0)
    if((finalrem.d4 == f.d4 && finalrem.d0 == 0 && finalrem.d5 == f.d5) ||
       (finalrem.d4 == 0    && finalrem.d0 == 1 && finalrem.d5 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=6;
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*6 + 2]=1;			/* f.d0 .. f.d3 are the constants 1, 0, 0, 0 */
        RES[index*6 + 3]=0;
        RES[index*6 + 4]=0;
        RES[index*6 + 5]=0;
        RES[index*6 + 6]=f.d4;
        RES[index*6 + 7]=f.d5;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* the first word is 0 when this block had no candidate at all; f and finalrem then still hold their zero initial values */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 6);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=f.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=f.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=finalrem.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+11]=finalrem.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+12]=finalrem.d5;
  }
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 128 <= N <= 159.  Works on f between 173 and 183 bits inclusive
//
// Phase 1: every thread scans its slice of this block's part of bit_array and the block compacts
//          the positions of all set bits into the dynamic shared array smem[].
// Phase 2: the threads walk smem[] with a stride of THREADS_PER_BLOCK; for each candidate
//          k = k_base + (bit position) * NUM_CLASSES they build f = k * 2^exp + 1, start from 2^256
//          and square (exp - 10) times modulo f using Barrett reduction.  A final remainder of
//          f - 1 or 1 is reported as a factor.
//
// Parameters:
//   exp             - the N in f = k*2^N+1 (128 <= N <= 159)
//   k_base          - k value belonging to bit 0 of bit_array; only d0 and d1 are used
//   bit_array       - sieve bits, one bit per candidate k, bits_to_process bits per block
//   bits_to_process - number of sieve bits handled by each thread block
//   RES             - RES[0] = factor counter (atomicInc), RES[1] = 6, up to 10 factors of 6 words
//                     each from RES[2] on; validation data at RESULTS_ARRAY_VALIDATION_OFFSET
//   bits_max        - selects the Barrett scaling: the reciprocal is floor(2^(352 + bits_max - 1) / f).
//                     Going by the comments in the body, bits_max - 1 = bits_in_f - 161; the value
//                     must lie in 2..32 so that all shifts by (bits_max - 1) and
//                     32 - (bits_max - 1) are well defined -- TODO confirm against the launching code
//
// Launch requirements:
//   - exactly 256 threads per block (bitcount[256] and the prefix sum below are hard-wired to it)
//   - dynamic shared memory of at least (number of set bits in the block) unsigned shorts
//   - assumes bits_to_process is a non-zero multiple of 8192 and at most 65536, so that every
//     thread gets at least one whole word and bit positions fit an unsigned short -- TODO confirm
//     against the launching code
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MAX_BLOCKS) mfaktc_barrett183_F128_159gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES, unsigned int bits_max)
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int192 f, finalrem;
  const unsigned int f_shift = exp - 128;		// k is shifted left by this amount within f.d4/f.d5
  const unsigned int norm_rshift = bits_max - 1;	// right-shift that divides by 2^(bits_in_f - 161)
  const unsigned int norm_lshift = 32 - norm_rshift;	// matching left-shift for the bits pulled in from the next word
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // f and finalrem are copied to RES[] by block 0 / thread 0 at the end of the kernel even when
  // that thread never tested a candidate.  Give them defined values for that case.
  f.d0 = 0; f.d1 = 0; f.d2 = 0; f.d3 = 0; f.d4 = 0; f.d5 = 0;
  finalrem.d0 = 0; finalrem.d1 = 0; finalrem.d2 = 0; finalrem.d3 = 0; finalrem.d4 = 0; finalrem.d5 = 0;

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;		// 8192 = 256 threads * 32 bits per word
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // Every tally step reads a count that another thread wrote in the previous step.  Threads of a
  // warp are not guaranteed to run in lock-step (independent thread scheduling on Volta and later),
  // so each step is separated by a barrier -- also the steps that stay within one warp.
  // Within a step no thread reads an entry that is written in that same step: writers have the
  // tested bit set in threadIdx.x, the entries they read have it clear.
  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// This thread owns the smem[] slots total_bit_count - bitcount[threadIdx.x] and up, one per set bit.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first) and clear it.
// The mask is built from an unsigned 1 so that clearing bit 31 does not overflow a signed int.

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1u << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value of this block.  The two statements form one add-with-carry chain, keep them adjacent.
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int192 a, u, tmp192;
    int384 b, tmp384;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = 0;
    f.d2 = 0;
    f.d3 = 0;
    f.d4 = (k.d0 << f_shift);
							// For exp == 128 nothing of k.d0 spills into f.d5.  The guard avoids
							// a shift by 32, which is undefined in C++.
    f.d5 = (k.d1 << f_shift) + (f_shift ? (k.d0 >> (32 - f_shift)) : 0);

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d5);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d4);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp384.d11 = 1u << norm_rshift;			// tmp384 = 2^(191 + bits_in_f)
    tmp384.d10 = 0; tmp384.d9 = 0; tmp384.d8 = 0; tmp384.d7 = 0; tmp384.d6 = 0;
    tmp384.d5 = 0; tmp384.d4 = 0; tmp384.d3 = 0; tmp384.d2 = 0; tmp384.d1 = 0; tmp384.d0 = 0;

    // Could write optimized div_384_192 with so many tmp384 elements known to be zero
    div_384_192(&u,tmp384,f,ff);			// u = floor(2^(191 + bits_in_f) / f), giving 192 bits of precision, requires f >= 161 bits

							// b_preinit = 2^256
							// a = b_preinit / 2 ^ (bits_in_f - 1)
							// tmp384 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (191 + bits_in_f) / f)     (ignore the floor functions for now)
							// a = tmp384 / 2^192, which if we do the math simplifies to the quotient: b_preinit / f
    a.d0 = (u.d3 >> norm_rshift) + (u.d4 << norm_lshift);
    a.d1 = (u.d4 >> norm_rshift) + (u.d5 << norm_lshift);
    a.d2 = (u.d5 >> norm_rshift);			// a.d3-a.d5 are left unset; the multiply below does not read them

    mul_192_F128_159_initial_special(&tmp192, a, f);	// tmp192 = quotient * f, we only compute the low 192-bits here

    a.d0 = __sub_cc (0, tmp192.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp192.d1);			// we do not need the upper digits of b_preinit and tmp192 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp192.d2);
    a.d3 = __subc_cc(0, tmp192.d3);
    a.d4 = __subc_cc(0, tmp192.d4);
    a.d5 = __subc   (0, tmp192.d5);

    // 2^256 = 2^(2^8); each pass squares once more, ending at 2^(2^(exp - 2)) mod f.
    for (shifter = 0; shifter < exp - 2 - 8; shifter++)
    {
							// On input a is at most 186.700 bits (see end of this loop)

      square_192_384(&b, a);				// b = a^2, b is at most 373.400 bits

							// a = b / (2 ^ (bits_in_f - 1)), a is at most 191.400 bits
      tmp192.d0 = (b.d5 >> norm_rshift) + (b.d6 << norm_lshift);
      tmp192.d1 = (b.d6 >> norm_rshift) + (b.d7 << norm_lshift);
      tmp192.d2 = (b.d7 >> norm_rshift) + (b.d8 << norm_lshift);
      tmp192.d3 = (b.d8 >> norm_rshift) + (b.d9 << norm_lshift);
      tmp192.d4 = (b.d9 >> norm_rshift) + (b.d10 << norm_lshift);
      tmp192.d5 = (b.d10 >> norm_rshift) + (b.d11 << norm_lshift);

      mul_192_384_no_low6(&a, tmp192, u);		// a = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (191 + bits_in_f) / f) / 2^192   (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 12.  A full mul_192_384 would add 11 partial results
							// into tmp384.d5 which could have generated 10 carries into tmp384.d6.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d5 could have been added into
							// tmp384.d5 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d5 could have been added into tmp384.d5 possibly generating a carry.
							// A grand total of up to 12 carries lost.

      mul_192_F128_159_special(&tmp192, a, f);		// tmp192 = quotient * f, we only compute the low 192-bits here

      a.d0 = __sub_cc (b.d0, tmp192.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp192.d1);		// we do not need the upper digits of b and tmp192 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp192.d2);
      a.d3 = __subc_cc(b.d3, tmp192.d3);
      a.d4 = __subc_cc(b.d4, tmp192.d4);
      a.d5 = __subc   (b.d5, tmp192.d5);
							// Since the quotient was up to 12 too small, the remainder has a maximum value of 13*f,
							// or 183 bits + log2 (13) bits, which is 186.700 bits.  In theory, this kernel can handle
							// f values up to 2^183.300.
    }

    mod_simple_192(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 13 times f.

#if 0
    if(cmp_ge_192(finalrem,f))
    {
      printf("EEEEEK, final finalrem is >= f\n");
    }
if (blockIdx.x == 0 && threadIdx.x == 4){
mul_192_384_no_low6(&tmp192,u,f);
printf ("    f: %08X%08X%08X%08X%08X%08X\r\n", f.d5, f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X %X\r\n", u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X %X\r\n", tmp192.d5, tmp192.d4, tmp192.d3, tmp192.d2, tmp192.d1, tmp192.d0);
printf ("  rem: %08X%08X%08X%08X%08X%08X\r\n", finalrem.d5, finalrem.d4, finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* a factor is reported when finalrem == f - 1 (first line of the test) or finalrem == 1 (second line) */

    if(finalrem.d3 == 0 && finalrem.d2 == 0 && finalrem.d1 == 0)
    if((finalrem.d4 == f.d4 && finalrem.d0 == 0 && finalrem.d5 == f.d5) ||
       (finalrem.d4 == 0    && finalrem.d0 == 1 && finalrem.d5 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=6;
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*6 + 2]=1;			/* f.d0 .. f.d3 are the constants 1, 0, 0, 0 */
        RES[index*6 + 3]=0;
        RES[index*6 + 4]=0;
        RES[index*6 + 5]=0;
        RES[index*6 + 6]=f.d4;
        RES[index*6 + 7]=f.d5;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* the first word is 0 when this block had no candidate at all; f and finalrem then still hold their zero initial values */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 6);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=f.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=f.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=finalrem.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+11]=finalrem.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+12]=finalrem.d5;
  }
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 128 <= N <= 159.  Works on f between 184 and 185 bits inclusive
//
// Each block first compacts the set bits of its section of the sieve bit_array into shared memory (one 16-bit
// offset per surviving candidate); the threads of the block then work through that list.  For every candidate k
// the kernel builds f = k * 2^exp + 1, starts from 2^256 = 2^(2^8), performs exp - 10 Barrett-reduced modular
// squarings to reach 2^(2^(exp-2)) mod f, and reports f when that residue is 1 or f - 1.
//
// Parameters:
//   exp             - the N in f = k*2^N+1
//   k_base          - k value belonging to bit 0 of bit_array; only d0/d1 are used (k values are limited to 64 bits)
//   bit_array       - sieve bits; block b owns bits [b * bits_to_process, (b + 1) * bits_to_process) and a set bit
//                     at offset p of that section selects k = k_base + (b * bits_to_process + p) * NUM_CLASSES
//   bits_to_process - number of sieve bits handled per block.  Each thread takes bits_to_process / 8192 32-bit words,
//                     so this is presumably a non-zero multiple of 8192; offsets are stored as unsigned short, so
//                     it presumably must not exceed 65536 -- TODO confirm against the launching code
//   RES             - result array.  RES[0] is bumped with atomicInc for every factor found, RES[1] is set to 6, and
//                     the first 10 factors are stored as six 32-bit words each starting at RES[2].  Block 0 / thread 0
//                     also stores its last f and residue at RES[RESULTS_ARRAY_VALIDATION_OFFSET ...] for host validation
//   bits_max        - judging from the shift amounts and the "2^(191 + bits_in_f)" comment below this is
//                     (bits in f) - 160 -- TODO confirm against the launching code
//
// Launch requirements: exactly 256 threads per block (the prefix sum below is hard-wired to 256 entries) and enough
// dynamic shared memory for one unsigned short per set bit in the block's section of bit_array.
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MAX_BLOCKS) mfaktc_barrett185_F128_159gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES, unsigned int bits_max)
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int192 f, a;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;		// 8192 = 256 threads * 32 bits per word
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // The first five tallies only exchange data within one warp.  They used to rely on the threads of a warp
  // running in lock-step, which is not guaranteed on GPUs with independent thread scheduling (Volta and later).
  // Every tally step is therefore preceded by a block barrier.  Within one step the threads that write always
  // read from threads that do not write in that step, so one barrier per step is sufficient.
  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// This thread's candidates go to smem[total_bit_count - bitcount[threadIdx.x] ...], so the slices of
// different threads do not overlap.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int192 u, tmp192;
    int384 b, tmp384;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = 0;
    f.d2 = 0;
    f.d3 = 0;
    f.d4 = (k.d0 << (exp - 128));
    f.d5 = (k.d1 << (exp - 128)) + (k.d0 >> (32 - (exp - 128)));

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d5);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d4);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp384.d11 = 1 << (bits_max - 1);			// tmp384 = 2^(191 + bits_in_f)
    tmp384.d10 = 0; tmp384.d9 = 0; tmp384.d8 = 0; tmp384.d7 = 0; tmp384.d6 = 0;
    tmp384.d5 = 0; tmp384.d4 = 0; tmp384.d3 = 0; tmp384.d2 = 0; tmp384.d1 = 0; tmp384.d0 = 0;

    // Could write optimized div_384_192 with so many tmp384 elements known to be zero
    div_384_192(&u,tmp384,f,ff);			// u = floor(2^(191 + bits_in_f) / f), giving 192 bits of precision, requires f >= 161 bits

							// b_preinit = 2^256
							// a = b_preinit / 2 ^ (bits_in_f - 1)
							// tmp384 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (191 + bits_in_f) / f)     (ignore the floor functions for now)
							// a = tmp384 / 2^192, which if we do the math simplifies to the quotient: b_preinit / f
    a.d0 = (u.d3 >> (bits_max - 1)) + (u.d4 << (32 - (bits_max - 1)));
    a.d1 = (u.d4 >> (bits_max - 1)) + (u.d5 << (32 - (bits_max - 1)));
    a.d2 = (u.d5 >> (bits_max - 1));

    mul_192_F128_159_initial_special(&tmp192, a, f);	// tmp192 = quotient * f, we only compute the low 192-bits here

    a.d0 = __sub_cc (0, tmp192.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp192.d1);			// we do not need the upper digits of b_preinit and tmp192 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp192.d2);
    a.d3 = __subc_cc(0, tmp192.d3);
    a.d4 = __subc_cc(0, tmp192.d4);
    a.d5 = __subc   (0, tmp192.d5);

    // a starts as 2^(2^8) mod f; exp - 10 squarings bring it to 2^(2^(exp-2)) mod f
    for (shifter = 0; shifter < exp - 2 - 8; shifter++)
    {
							// On input a is at most 185.807 bits (see end of this loop)

      square_192_384(&b, a);				// b = a^2, b is at most 371.614 bits

      a.d0 = (b.d5 >> (bits_max - 5)) + (b.d6 << (32 - (bits_max - 5))); // a = b / (2 ^ (bits_in_f - 5)), a is at most 191.614 bits
      a.d1 = (b.d6 >> (bits_max - 5)) + (b.d7 << (32 - (bits_max - 5)));
      a.d2 = (b.d7 >> (bits_max - 5)) + (b.d8 << (32 - (bits_max - 5)));
      a.d3 = (b.d8 >> (bits_max - 5)) + (b.d9 << (32 - (bits_max - 5)));
      a.d4 = (b.d9 >> (bits_max - 5)) + (b.d10 << (32 - (bits_max - 5)));
      a.d5 = (b.d10 >> (bits_max - 5)) + (b.d11 << (32 - (bits_max - 5)));

      mul_192_384_no_low6(&tmp192, a, u);		// tmp192 = (b / 2 ^ (bits_in_f - 5)) * (2 ^ (191 + bits_in_f) / f) / 2^192    (ignore the floor functions for now)

      a.d0 = (tmp192.d0 >> 4) + (tmp192.d1 << 28);	// a = tmp192 / 2^4, which if we do the math simplifies to the quotient: b / f
      a.d1 = (tmp192.d1 >> 4) + (tmp192.d2 << 28);
      a.d2 = (tmp192.d2 >> 4) + (tmp192.d3 << 28);
      a.d3 = (tmp192.d3 >> 4) + (tmp192.d4 << 28);
      a.d4 = (tmp192.d4 >> 4) + (tmp192.d5 << 28);
      a.d5 = (tmp192.d5 >> 4);
							// The quotient is off by at most 12/16.  A full mul_192_384 would add 11 partial results
							// into tmp384.d5 which could have generated 10 carries into tmp384.d6.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d5 could have been added into
							// tmp384.d5 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d5 could have been added into tmp384.d5 possibly generating a carry.
							// A grand total of up to 12 carries lost.  However, we calculated an extra 4 bits of
							// precision in mul_192_384_no_low6, so the shifted quotient is off by at most 12/16.

      mul_192_F128_159_special(&tmp192, a, f);		// tmp192 = quotient * f, we only compute the low 192-bits here

      a.d0 = __sub_cc (b.d0, tmp192.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp192.d1);		// we do not need the upper digits of b and tmp192 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp192.d2);
      a.d3 = __subc_cc(b.d3, tmp192.d3);
      a.d4 = __subc_cc(b.d4, tmp192.d4);
      a.d5 = __subc   (b.d5, tmp192.d5);
							// Since the quotient was up to 12/16 too small, the remainder has a maximum value of (1+12/16)*f,
							// or 185 bits + log2 (1+12/16) bits, which is 185.807 bits.  In theory, this kernel can handle
							// f values up to 2^185.193.
    }

    if(cmp_ge_192(a,f))					// final adjustment in case a >= f
    {
      sub_192(&a, a, f);
    }

#if 0
    if(cmp_ge_192(a,f))
    {
      printf("EEEEEK, final a is >= f\n");
    }
if (blockIdx.x == 0 && threadIdx.x == 4){
mul_192_384_no_low6(&tmp192,u,f);
printf ("    f: %08X%08X%08X%08X%08X%08X\r\n", f.d5, f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X %X\r\n", u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X %X\r\n", tmp192.d5, tmp192.d4, tmp192.d3, tmp192.d2, tmp192.d1, tmp192.d0);
printf ("  rem: %08X%08X%08X%08X%08X%08X\r\n", a.d5, a.d4, a.d3, a.d2, a.d1, a.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* f is reported when the residue is f - 1 (first alternative) or 1 (second alternative) */

    if(a.d3 == 0 && a.d2 == 0 && a.d1 == 0)
    if((a.d4 == f.d4 && a.d0 == 0 && a.d5 == f.d5) ||
       (a.d4 == 0    && a.d0 == 1 && a.d5 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=6;
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*6 + 2]=1;			/* f.d0 .. f.d3 are the constants 1, 0, 0, 0 */
	RES[index*6 + 3]=0;
	RES[index*6 + 4]=0;
	RES[index*6 + 5]=0;
	RES[index*6 + 6]=f.d4;
	RES[index*6 + 7]=f.d5;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* f and a hold the last candidate of block 0 / thread 0; they are only meaningful when total_bit_count != 0 */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 6);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=f.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=f.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=a.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=a.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9]=a.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=a.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+11]=a.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+12]=a.d5;
  }
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 128 <= N <= 159.  Works on f between 186 and 188 bits inclusive
//
// Each block first compacts the set bits of its section of the sieve bit_array into shared memory (one 16-bit
// offset per surviving candidate); the threads of the block then work through that list.  For every candidate k
// the kernel builds f = k * 2^exp + 1, starts from 2^256 = 2^(2^8), performs exp - 10 Barrett-reduced modular
// squarings (each followed by a mod_simple_192 correction) to reach 2^(2^(exp-2)) mod f, and reports f when that
// residue is 1 or f - 1.
//
// Parameters:
//   exp             - the N in f = k*2^N+1
//   k_base          - k value belonging to bit 0 of bit_array; only d0/d1 are used (k values are limited to 64 bits)
//   bit_array       - sieve bits; block b owns bits [b * bits_to_process, (b + 1) * bits_to_process) and a set bit
//                     at offset p of that section selects k = k_base + (b * bits_to_process + p) * NUM_CLASSES
//   bits_to_process - number of sieve bits handled per block.  Each thread takes bits_to_process / 8192 32-bit words,
//                     so this is presumably a non-zero multiple of 8192; offsets are stored as unsigned short, so
//                     it presumably must not exceed 65536 -- TODO confirm against the launching code
//   RES             - result array.  RES[0] is bumped with atomicInc for every factor found, RES[1] is set to 6, and
//                     the first 10 factors are stored as six 32-bit words each starting at RES[2].  Block 0 / thread 0
//                     also stores its last f and residue at RES[RESULTS_ARRAY_VALIDATION_OFFSET ...] for host validation
//   bits_max        - judging from the shift amounts and the "2^(191 + bits_in_f)" comment below this is
//                     (bits in f) - 160 -- TODO confirm against the launching code
//
// Launch requirements: exactly 256 threads per block (the prefix sum below is hard-wired to 256 entries) and enough
// dynamic shared memory for one unsigned short per set bit in the block's section of bit_array.
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MAX_BLOCKS) mfaktc_barrett188_F128_159gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES, unsigned int bits_max)
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int192 f, a;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;		// 8192 = 256 threads * 32 bits per word
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // The first five tallies only exchange data within one warp.  They used to rely on the threads of a warp
  // running in lock-step, which is not guaranteed on GPUs with independent thread scheduling (Volta and later).
  // Every tally step is therefore preceded by a block barrier.  Within one step the threads that write always
  // read from threads that do not write in that step, so one barrier per step is sufficient.
  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// This thread's candidates go to smem[total_bit_count - bitcount[threadIdx.x] ...], so the slices of
// different threads do not overlap.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int192 u, tmp192;
    int384 b, tmp384;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = 0;
    f.d2 = 0;
    f.d3 = 0;
    f.d4 = (k.d0 << (exp - 128));
    f.d5 = (k.d1 << (exp - 128)) + (k.d0 >> (32 - (exp - 128)));

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d5);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d4);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp384.d11 = 1 << (bits_max - 1);			// tmp384 = 2^(191 + bits_in_f)
    tmp384.d10 = 0; tmp384.d9 = 0; tmp384.d8 = 0; tmp384.d7 = 0; tmp384.d6 = 0;
    tmp384.d5 = 0; tmp384.d4 = 0; tmp384.d3 = 0; tmp384.d2 = 0; tmp384.d1 = 0; tmp384.d0 = 0;

    // Could write optimized div_384_192 with so many tmp384 elements known to be zero
    div_384_192(&u,tmp384,f,ff);			// u = floor(2^(191 + bits_in_f) / f), giving 192 bits of precision, requires f >= 161 bits

							// b_preinit = 2^256
							// a = b_preinit / 2 ^ (bits_in_f - 1)
							// tmp384 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (191 + bits_in_f) / f)     (ignore the floor functions for now)
							// a = tmp384 / 2^192, which if we do the math simplifies to the quotient: b_preinit / f
    a.d0 = (u.d3 >> (bits_max - 1)) + (u.d4 << (32 - (bits_max - 1)));
    a.d1 = (u.d4 >> (bits_max - 1)) + (u.d5 << (32 - (bits_max - 1)));
    a.d2 = (u.d5 >> (bits_max - 1));

    mul_192_F128_159_initial_special(&tmp192, a, f);	// tmp192 = quotient * f, we only compute the low 192-bits here

    a.d0 = __sub_cc (0, tmp192.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp192.d1);			// we do not need the upper digits of b_preinit and tmp192 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp192.d2);
    a.d3 = __subc_cc(0, tmp192.d3);
    a.d4 = __subc_cc(0, tmp192.d4);
    a.d5 = __subc   (0, tmp192.d5);

    // a starts as 2^(2^8) mod f; exp - 10 squarings bring it to 2^(2^(exp-2)) mod f
    for (shifter = 0; shifter < exp - 2 - 8; shifter++)
    {
							// On input a is at most 188 bits (see end of this loop)

      square_192_384(&b, a);				// b = a^2, b is at most 376 bits

      tmp192.d0 = (b.d5 >> (bits_max - 1)) + (b.d6 << (32 - (bits_max - 1))); // a = b / (2 ^ (bits_in_f - 1)), a is at most 189 bits
      tmp192.d1 = (b.d6 >> (bits_max - 1)) + (b.d7 << (32 - (bits_max - 1)));
      tmp192.d2 = (b.d7 >> (bits_max - 1)) + (b.d8 << (32 - (bits_max - 1)));
      tmp192.d3 = (b.d8 >> (bits_max - 1)) + (b.d9 << (32 - (bits_max - 1)));
      tmp192.d4 = (b.d9 >> (bits_max - 1)) + (b.d10 << (32 - (bits_max - 1)));
      tmp192.d5 = (b.d10 >> (bits_max - 1)) + (b.d11 << (32 - (bits_max - 1)));

      mul_192_384_no_low6(&a, tmp192, u);		// a = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (191 + bits_in_f) / f) / 2^192   (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 12.  A full mul_192_384 would add 11 partial results
							// into tmp384.d5 which could have generated 10 carries into tmp384.d6.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d5 could have been added into
							// tmp384.d5 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d5 could have been added into tmp384.d5 possibly generating a carry.
							// A grand total of up to 12 carries lost.

      mul_192_F128_159_special(&tmp192, a, f);		// tmp192 = quotient * f, we only compute the low 192-bits here

      tmp192.d0 = __sub_cc (b.d0, tmp192.d0);		// Compute the remainder
      tmp192.d1 = __subc_cc(b.d1, tmp192.d1);		// we do not need the upper digits of b and tmp192 because the result is 0 after subtraction!
      tmp192.d2 = __subc_cc(b.d2, tmp192.d2);
      tmp192.d3 = __subc_cc(b.d3, tmp192.d3);
      tmp192.d4 = __subc_cc(b.d4, tmp192.d4);
      tmp192.d5 = __subc   (b.d5, tmp192.d5);
							// Since the quotient was up to 12 too small, the remainder has a maximum value of 13*f,
							// or 188 bits + log2 (13) bits, which is 191.700 bits.

// Optimization:  Don't do the compare at the end of mod_simple_192.  Find a completely faster way to do this!  
      mod_simple_192(&a, tmp192, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 12 times f.
						        // The adjustment should be perfect, so we now have 188 bits.
    }

    if(cmp_ge_192(a,f))					// final adjustment in case a >= f
    {
      sub_192(&a, a, f);
    }

#if 0
    if(cmp_ge_192(a,f))
    {
      printf("EEEEEK, final a is >= f\n");
    }
if (blockIdx.x == 0 && threadIdx.x == 4){
mul_192_384_no_low6(&tmp192,u,f);
printf ("    f: %08X%08X%08X%08X%08X%08X\r\n", f.d5, f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X %X\r\n", u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X %X\r\n", tmp192.d5, tmp192.d4, tmp192.d3, tmp192.d2, tmp192.d1, tmp192.d0);
printf ("  rem: %08X%08X%08X%08X%08X%08X\r\n", a.d5, a.d4, a.d3, a.d2, a.d1, a.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* f is reported when the residue is f - 1 (first alternative) or 1 (second alternative) */

    if(a.d3 == 0 && a.d2 == 0 && a.d1 == 0)
    if((a.d4 == f.d4 && a.d0 == 0 && a.d5 == f.d5) ||
       (a.d4 == 0    && a.d0 == 1 && a.d5 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=6;
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*6 + 2]=1;			/* f.d0 .. f.d3 are the constants 1, 0, 0, 0 */
	RES[index*6 + 3]=0;
	RES[index*6 + 4]=0;
	RES[index*6 + 5]=0;
	RES[index*6 + 6]=f.d4;
	RES[index*6 + 7]=f.d5;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* f and a hold the last candidate of block 0 / thread 0; they are only meaningful when total_bit_count != 0 */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 6);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=f.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=f.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=a.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=a.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9]=a.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=a.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+11]=a.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+12]=a.d5;
  }
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 128 <= N <= 159.  Works on f between 183 and 192 bits inclusive
//
// Each block first compacts the set bits of its section of the sieve bit_array into shared memory (one 16-bit
// offset per surviving candidate); the threads of the block then work through that list.  For every candidate k
// the kernel builds f = k * 2^exp + 1, starts from 2^256 = 2^(2^8), performs exp - 10 Barrett-reduced modular
// squarings on 224-bit intermediates to reach a value congruent to 2^(2^(exp-2)) mod f, reduces it with
// mod_simple_224_192, and reports f when the final residue is 1 or f - 1.
//
// Parameters:
//   exp             - the N in f = k*2^N+1
//   k_base          - k value belonging to bit 0 of bit_array; only d0/d1 are used (k values are limited to 64 bits)
//   bit_array       - sieve bits; block b owns bits [b * bits_to_process, (b + 1) * bits_to_process) and a set bit
//                     at offset p of that section selects k = k_base + (b * bits_to_process + p) * NUM_CLASSES
//   bits_to_process - number of sieve bits handled per block.  Each thread takes bits_to_process / 8192 32-bit words,
//                     so this is presumably a non-zero multiple of 8192; offsets are stored as unsigned short, so
//                     it presumably must not exceed 65536 -- TODO confirm against the launching code
//   RES             - result array.  RES[0] is bumped with atomicInc for every factor found, RES[1] is set to 6, and
//                     the first 10 factors are stored as six 32-bit words each starting at RES[2].  Block 0 / thread 0
//                     also stores its last f and residue at RES[RESULTS_ARRAY_VALIDATION_OFFSET ...] for host validation
//
// Launch requirements: exactly 256 threads per block (the prefix sum below is hard-wired to 256 entries) and enough
// dynamic shared memory for one unsigned short per set bit in the block's section of bit_array.
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MAX_BLOCKS) mfaktc_barrett192_F128_159gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES)
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int192 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;		// 8192 = 256 threads * 32 bits per word
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // The first five tallies only exchange data within one warp.  They used to rely on the threads of a warp
  // running in lock-step, which is not guaranteed on GPUs with independent thread scheduling (Volta and later).
  // Every tally step is therefore preceded by a block barrier.  Within one step the threads that write always
  // read from threads that do not write in that step, so one barrier per step is sufficient.
  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// This thread's candidates go to smem[total_bit_count - bitcount[threadIdx.x] ...], so the slices of
// different threads do not overlap.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int224 a, u, tmp224;
    int416 b, tmp416;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = 0;
    f.d2 = 0;
    f.d3 = 0;
    f.d4 = (k.d0 << (exp - 128));
    f.d5 = (k.d1 << (exp - 128)) + (k.d0 >> (32 - (exp - 128)));

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d5);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d4);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp416.d12= 0x10000;				// tmp416 is 2^400
    tmp416.d11= 0; tmp416.d10= 0; tmp416.d9 = 0; tmp416.d8 = 0;
    tmp416.d7 = 0; tmp416.d6 = 0; tmp416.d5 = 0; tmp416.d4 = 0;
    tmp416.d3 = 0; tmp416.d2 = 0; tmp416.d1 = 0; tmp416.d0 = 0;

    // Could write optimized div_416_192 with so many tmp416 elements known to be zero
    div_416_192(&u,tmp416,f,ff);			// u = floor(2^400 / f).  This requires f >= 177 bits.

							// b_preinit = 2^256
							// a = b_preinit / 2^176 = 2^80
							// tmp416 = a * u = (b_preinit / 2^176) * (2^400 / f)     (ignore the floor functions for now)
    a.d0 = (u.d5 << 16) + (u.d4 >> 16);			// a = tmp416 / 2^224, which if we do the math simplifies to the quotient: b_preinit / f
    a.d1 = (u.d6 << 16) + (u.d5 >> 16);
    a.d2 =                (u.d6 >> 16);

    mul_224_192_F128_159_initial_special16(&tmp224, a, f); // tmp224 = quotient * f, we only compute the low 224-bits here

    a.d0 = __sub_cc (0, tmp224.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp224.d1);			// we do not need the upper digits of b_preinit and tmp224 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp224.d2);
    a.d3 = __subc_cc(0, tmp224.d3);
    a.d4 = __subc_cc(0, tmp224.d4);
    a.d5 = __subc_cc(0, tmp224.d5);
    a.d6 = __subc   (0, tmp224.d6);

    // a starts as 2^(2^8) mod f; exp - 10 squarings bring it to a value congruent to 2^(2^(exp-2)) mod f
    for (shifter = 0; shifter < exp - 2 - 8; shifter++)
    {
							// On input a is at most 195.907 bits (see end of this loop)

      square_224_416(&b, a);				// b = a^2, b is at most 391.814 bits

      tmp224.d0 = (b.d6 << 16) + (b.d5 >> 16);		// a = b / 2^176, a is at most 215.814 bits
      tmp224.d1 = (b.d7 << 16) + (b.d6 >> 16);
      tmp224.d2 = (b.d8 << 16) + (b.d7 >> 16);
      tmp224.d3 = (b.d9 << 16) + (b.d8 >> 16);
      tmp224.d4 = (b.d10<< 16) + (b.d9 >> 16);
      tmp224.d5 = (b.d11<< 16) + (b.d10>> 16);
      tmp224.d6 = (b.d12<< 16) + (b.d11>> 16);

      mul_224_448_no_low7(&a, tmp224, u);		// a = (b / 2^176) * (2^400 / f) / 2^224    (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 14.  A full mul_224_416 would add 13 partial results
							// into tmp416.d6 which could have generated 12 carries into tmp416.d7.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d6 could have been added into
							// tmp416.d6 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d6 could have been added into tmp416.d6 possibly generating a carry.
							// A grand total of up to 14 carries lost.

      mul_224_192_F128_159_special(&tmp224, a, f);	// tmp224 = quotient * f, we only compute the low 224-bits here

      a.d0 = __sub_cc (b.d0, tmp224.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp224.d1);		// we do not need the upper digits of b and tmp224 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp224.d2);
      a.d3 = __subc_cc(b.d3, tmp224.d3);
      a.d4 = __subc_cc(b.d4, tmp224.d4);
      a.d5 = __subc_cc(b.d5, tmp224.d5);
      a.d6 = __subc   (b.d6, tmp224.d6);
							// Since the quotient was up to 14 too small, the remainder has a maximum value of 15*f,
							// or 192 bits + log2 (15) bits, which is 195.907 bits.
    }

    mod_simple_224_192(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 15 times f.

#if 0
    if(cmp_ge_192(finalrem,f) && (f.d5 & 0xFFFF0000))
    {
      printf("EEEEEK, final rem is >= f\n");
    }
if ((blockIdx.x == 0 && threadIdx.x == 0)){
int224 f224;	    
f224.d0 = f.d0;
f224.d1 = f.d1;
f224.d2 = f.d2;
f224.d3 = f.d3;
f224.d4 = f.d4;
f224.d5 = f.d5;
f224.d6 = 0;
mul_224_448_no_low7(&tmp224,u,f224);
printf ("    f: %08X%08X%08X%08X%08X%08X\r\n", f.d5, f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %08X%08X%08X%08X%08X%08X%08X\r\n", u.d6, u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %08X%08X%08X%08X%08X%08X%08X\r\n", tmp224.d6, tmp224.d5, tmp224.d4, tmp224.d3, tmp224.d2, tmp224.d1, tmp224.d0);
printf ("  rem: %08X%08X%08X%08X%08X%08X\r\n", finalrem.d5, finalrem.d4, finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* f is reported when the residue is f - 1 (first alternative) or 1 (second alternative) */

    if(finalrem.d3 == 0 && finalrem.d2 == 0 && finalrem.d1 == 0)
    if((finalrem.d4 == f.d4 && finalrem.d0 == 0 && finalrem.d5 == f.d5) ||
       (finalrem.d4 == 0    && finalrem.d0 == 1 && finalrem.d5 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=6;
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*6 + 2]=1;			/* f.d0 .. f.d3 are the constants 1, 0, 0, 0 */
	RES[index*6 + 3]=0;
	RES[index*6 + 4]=0;
	RES[index*6 + 5]=0;
	RES[index*6 + 6]=f.d4;
	RES[index*6 + 7]=f.d5;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* f and finalrem hold the last candidate of block 0 / thread 0; they are only meaningful when total_bit_count != 0 */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 6);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=f.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=f.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=finalrem.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+11]=finalrem.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+12]=finalrem.d5;
  }
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 128 <= N <= 159.  Works on f between 193 and 204 bits inclusive
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MAX_BLOCKS) mfaktc_barrett204_F128_159gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES)
/*
Trial factoring kernel working directly on a sieved bit array.  Every set bit in this block's
section of bit_array selects one candidate f = k * 2^exp + 1 which is tested with a Barrett
style modular squaring loop.

exp             - the N in f = k * 2^N + 1, 128 <= exp <= 159
k_base          - k value belonging to bit 0 of bit_array (only d0 and d1 are used, k is limited to 64 bits)
bit_array       - sieve output, each block processes bits_to_process consecutive bits
bits_to_process - number of sieve bits per block, each thread handles bits_to_process / 8192 32-bit words
RES             - result array: RES[0] counts the hits (atomicInc), RES[1] is set to 7 which matches the
                  7-word stride of the factors stored from RES[2] on (at most 10 are stored).
                  Thread 0 of block 0 additionally writes f and the final remainder of the last
                  candidate it tested to RES[RESULTS_ARRAY_VALIDATION_OFFSET...].

Launch requirements:
- exactly 256 threads per block (bitcount[] and the prefix sum below are hard-wired to 256)
- dynamic shared memory must hold one unsigned short per set bit in the block's section of bit_array
*/
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int224 f, finalrem;
  const unsigned int k_shift = exp - 128;		// f.d4 starts at bit 128, so k must be shifted left by exp - 128 (0..31) bits
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // Every tally step reads a slot that was written by another thread in the previous step.
  // Threads of one warp are NOT guaranteed to run in lock-step (independent thread scheduling
  // on Volta and newer GPUs), so we synchronize before every step, not only before the steps
  // that cross warp boundaries.  Within one step no thread reads a slot that is written in
  // that same step (the slot read always belongs to a thread with the tested bit clear), so
  // one barrier between two steps is sufficient.
  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// The first free smem slot of this thread is total_bit_count - bitcount[threadIdx.x].

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first) and clear it

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int224 a, u, tmp224;
    int416 b, tmp416;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = 0;
    f.d2 = 0;
    f.d3 = 0;
							// For exp == 128 (k_shift == 0) nothing spills into the next word.
							// Shifting right by 32 would be undefined, so handle that case explicitly.
    f.d4 = (k.d0 << k_shift);
    f.d5 = (k.d1 << k_shift) + (k_shift ? (k.d0 >> (32 - k_shift)) : 0);
    f.d6 =                     (k_shift ? (k.d1 >> (32 - k_shift)) : 0);

/*
ff = 1 / f as float (built from the two most significant words of f)
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d6);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d5);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp416.d12 = 0xFFFFFFFF;				// tmp416 = 2^416 - 1, i.e. nearly 2^416
    tmp416.d11 = tmp416.d10 = tmp416.d9 = 0xFFFFFFFF; tmp416.d8 = 0xFFFFFFFF; tmp416.d7 = 0xFFFFFFFF; tmp416.d6 = 0xFFFFFFFF; tmp416.d5 = 0xFFFFFFFF;
    tmp416.d4 = 0xFFFFFFFF; tmp416.d3 = 0xFFFFFFFF; tmp416.d2 = 0xFFFFFFFF; tmp416.d1 = 0xFFFFFFFF; tmp416.d0 = 0xFFFFFFFF;

    // Could write optimized div_416_224 with so many tmp416 elements known to be all ones
    div_416_224(&u,tmp416,f,ff);			// u = floor(2^416 / f).  This requires f >= 161 bits.

							// b_preinit = 2^256
							// a = b_preinit / 2^192 = 2^64
							// tmp416 = a * u = (b_preinit / 2^192) * (2^416 / f)     (ignore the floor functions for now)
    a.d0 = u.d5;					// a = tmp416 / 2^224, which if we do the math simplifies to the quotient: b_preinit / f
    a.d1 = u.d6;					// (multiplying by 2^64 and dividing by 2^224 is just u >> 160)
    a.d2 = 0;

    mul_224_F128_159_initial_special(&tmp224, a, f);	// tmp224 = quotient * f, we only compute the low 224-bits here

    a.d0 = __sub_cc (0, tmp224.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp224.d1);			// we do not need the upper digits of b_preinit and tmp224 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp224.d2);
    a.d3 = __subc_cc(0, tmp224.d3);
    a.d4 = __subc_cc(0, tmp224.d4);
    a.d5 = __subc_cc(0, tmp224.d5);
    a.d6 = __subc   (0, tmp224.d6);

							// a is 2^256 = 2^(2^8) mod f (not fully reduced).  exp - 10 squarings
							// turn this into 2^(2^(exp - 2)) mod f.
    for (shifter = 0; shifter < exp - 2 - 8; shifter++)
    {
							// On input a is at most 207.907 bits (see end of this loop)

      square_224_416(&b, a);				// b = a^2, b is at most 415.814 bits

      tmp224.d0 = b.d6;					// a = b / 2^192, a is at most 223.814 bits
      tmp224.d1 = b.d7;
      tmp224.d2 = b.d8;
      tmp224.d3 = b.d9;
      tmp224.d4 = b.d10;
      tmp224.d5 = b.d11;
      tmp224.d6 = b.d12;

      mul_224_448_no_low7(&a, tmp224, u);		// a = (b / 2^192) * (2^416 / f) / 2^224    (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 14.  A full mul_224_448 would add 13 partial results
							// into tmp448.d6 which could have generated 12 carries into tmp448.d7.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d6 could have been added into
							// tmp448.d6 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d6 could have been added into tmp448.d6 possibly generating a carry.
							// A grand total of up to 14 carries lost.

      mul_224_F128_159_special(&tmp224, a, f);		// tmp224 = quotient * f, we only compute the low 224-bits here

      a.d0 = __sub_cc (b.d0, tmp224.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp224.d1);		// we do not need the upper digits of b and tmp224 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp224.d2);
      a.d3 = __subc_cc(b.d3, tmp224.d3);
      a.d4 = __subc_cc(b.d4, tmp224.d4);
      a.d5 = __subc_cc(b.d5, tmp224.d5);
      a.d6 = __subc   (b.d6, tmp224.d6);
							// Since the quotient was up to 14 too small, the remainder has a maximum value of 15*f,
							// or 204 bits + log2 (15) bits, which is 207.907 bits.
    }

    mod_simple_224(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 15 times f.

#if 0
    if(cmp_ge_224(finalrem,f) && f.d5)
    {
      printf("EEEEEK, final finalrem is >= f\n");
    }
if (blockIdx.x == 12 && threadIdx.x == 4){
mul_224_448_no_low7(&tmp224,u,f);
printf ("    f: %08X%08X%08X%08X%08X%08X%08X\r\n", f.d6, f.d5, f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X %X %X\r\n", u.d6, u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X %X %X\r\n", tmp224.d6, tmp224.d5, tmp224.d4, tmp224.d3, tmp224.d2, tmp224.d1, tmp224.d0);
printf ("  rem: %08X%08X%08X%08X%08X%08X%08X\r\n", finalrem.d6, finalrem.d5, finalrem.d4, finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* f.d0 is 1 and f.d1, f.d2, f.d3 are 0, so the first alternative below is finalrem == f - 1 */
/* and the second one is finalrem == 1.  NOTE(review): presumably f - 1 means f divides the */
/* Fermat number F(exp-2) and 1 means f divides a smaller Fermat number - confirm. */

    if(finalrem.d3 == 0 && finalrem.d2 == 0 && finalrem.d1 == 0)
    if((finalrem.d5 == f.d5 && finalrem.d4 == f.d4 && finalrem.d0 == 0 && finalrem.d6 == f.d6) ||
       (finalrem.d5 == 0    && finalrem.d4 == 0    && finalrem.d0 == 1 && finalrem.d6 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=7;
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*7 + 2]=1;			/* f.d0 */
        RES[index*7 + 3]=0;			/* f.d1 */
        RES[index*7 + 4]=0;			/* f.d2 */
        RES[index*7 + 5]=0;			/* f.d3 */
        RES[index*7 + 6]=f.d4;
        RES[index*7 + 7]=f.d5;
        RES[index*7 + 8]=f.d6;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* If the block had no candidates at all, f and finalrem were never assigned.  In that case the */
/* first word is 0 and the remaining validation words are meaningless. */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 7);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=f.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=f.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=f.d6;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]= finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9]= finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+11]=finalrem.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+12]=finalrem.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+13]=finalrem.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+14]=finalrem.d6;
  }
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 128 <= N <= 159.  Works on f between 205 and 215 bits inclusive
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MAX_BLOCKS) mfaktc_barrett215_F128_159gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES, unsigned int bits_max)
/*
Trial factoring kernel working directly on a sieved bit array.  Every set bit in this block's
section of bit_array selects one candidate f = k * 2^exp + 1 which is tested with a Barrett
style modular squaring loop.

exp             - the N in f = k * 2^N + 1, 128 <= exp <= 159
k_base          - k value belonging to bit 0 of bit_array (only d0 and d1 are used, k is limited to 64 bits)
bit_array       - sieve output, each block processes bits_to_process consecutive bits
bits_to_process - number of sieve bits per block, each thread handles bits_to_process / 8192 32-bit words
RES             - result array: RES[0] counts the hits (atomicInc), RES[1] is set to 7 which matches the
                  7-word stride of the factors stored from RES[2] on (at most 10 are stored).
                  Thread 0 of block 0 additionally writes f and the final remainder of the last
                  candidate it tested to RES[RESULTS_ARRAY_VALIDATION_OFFSET...].
bits_max        - selects the scaling of the Barrett reciprocal: u = floor(2^(415 + bits_max) / f) and
                  the squares are shifted right by 191 + bits_max bits.  bits_max - 1 is used as a
                  32-bit shift count together with 32 - (bits_max - 1), so 2 <= bits_max <= 32 is needed.
                  NOTE(review): presumably bits_max = (number of bits in f) - 192 - confirm against caller.

Launch requirements:
- exactly 256 threads per block (bitcount[] and the prefix sum below are hard-wired to 256)
- dynamic shared memory must hold one unsigned short per set bit in the block's section of bit_array
*/
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int224 f, finalrem;
  const unsigned int k_shift = exp - 128;		// f.d4 starts at bit 128, so k must be shifted left by exp - 128 (0..31) bits
  const unsigned int rshift = bits_max - 1;		// shift counts used to extract a 32-bit word that
  const unsigned int lshift = 32 - rshift;		// starts rshift bits into a multi-word value
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // Every tally step reads a slot that was written by another thread in the previous step.
  // Threads of one warp are NOT guaranteed to run in lock-step (independent thread scheduling
  // on Volta and newer GPUs), so we synchronize before every step, not only before the steps
  // that cross warp boundaries.  Within one step no thread reads a slot that is written in
  // that same step (the slot read always belongs to a thread with the tested bit clear), so
  // one barrier between two steps is sufficient.
  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// The first free smem slot of this thread is total_bit_count - bitcount[threadIdx.x].

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first) and clear it

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int224 a, u, tmp224;
    int448 b, tmp448;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = 0;
    f.d2 = 0;
    f.d3 = 0;
							// For exp == 128 (k_shift == 0) nothing spills into the next word.
							// Shifting right by 32 would be undefined, so handle that case explicitly.
    f.d4 = (k.d0 << k_shift);
    f.d5 = (k.d1 << k_shift) + (k_shift ? (k.d0 >> (32 - k_shift)) : 0);
    f.d6 =                     (k_shift ? (k.d1 >> (32 - k_shift)) : 0);

/*
ff = 1 / f as float (built from the two most significant words of f)
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d6);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d5);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp448.d13 = 1u << rshift;				// tmp448 = 2^(416 + bits_max - 1) = 2^(415 + bits_max)
    tmp448.d12 = tmp448.d11 = tmp448.d10 = 0; tmp448.d9 = 0; tmp448.d8 = 0; tmp448.d7 = 0; tmp448.d6 = 0;
    tmp448.d5 = 0; tmp448.d4 = 0; tmp448.d3 = 0; tmp448.d2 = 0; tmp448.d1 = 0; tmp448.d0 = 0;

    // Could write optimized div_448_224 with so many tmp448 elements known to be zero
    div_448_224(&u,tmp448,f,ff);			// u = floor(2^(415 + bits_max) / f), requires f >= 161 bits

							// b_preinit = 2^256
							// quotient b_preinit / f = 2^256 * u / 2^(415 + bits_max)     (ignore the floor functions for now)
							// which is simply u shifted right by 159 + bits_max = 160 + (bits_max - 1) bits
    a.d0 = (u.d5 >> rshift) + (u.d6 << lshift);
    a.d1 = (u.d6 >> rshift);
    a.d2 = 0;

    mul_224_F128_159_initial_special(&tmp224, a, f);	// tmp224 = quotient * f, we only compute the low 224-bits here

    a.d0 = __sub_cc (0, tmp224.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp224.d1);			// we do not need the upper digits of b_preinit and tmp224 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp224.d2);
    a.d3 = __subc_cc(0, tmp224.d3);
    a.d4 = __subc_cc(0, tmp224.d4);
    a.d5 = __subc_cc(0, tmp224.d5);
    a.d6 = __subc   (0, tmp224.d6);

							// a is 2^256 = 2^(2^8) mod f (not fully reduced).  exp - 10 squarings
							// turn this into 2^(2^(exp - 2)) mod f.
    for (shifter = 0; shifter < exp - 2 - 8; shifter++)
    {
							// On input a is at most 218.907 bits (see end of this loop)

      square_224_448(&b, a);				// b = a^2, b is at most 437.814 bits

      tmp224.d0 = (b.d6  >> rshift) + (b.d7  << lshift);	// tmp224 = b / 2^(191 + bits_max), this is at most 223.814 bits
      tmp224.d1 = (b.d7  >> rshift) + (b.d8  << lshift);	// if b / 2^(191 + bits_max) equals b / 2^(bits_in_f - 1)
      tmp224.d2 = (b.d8  >> rshift) + (b.d9  << lshift);
      tmp224.d3 = (b.d9  >> rshift) + (b.d10 << lshift);
      tmp224.d4 = (b.d10 >> rshift) + (b.d11 << lshift);
      tmp224.d5 = (b.d11 >> rshift) + (b.d12 << lshift);
      tmp224.d6 = (b.d12 >> rshift) + (b.d13 << lshift);

      mul_224_448_no_low7(&a, tmp224, u);		// a = (b / 2^(191 + bits_max)) * (2^(415 + bits_max) / f) / 2^224   (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 14.  A full mul_224_448 would add 13 partial results
							// into tmp448.d6 which could have generated 12 carries into tmp448.d7.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d6 could have been added into
							// tmp448.d6 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d6 could have been added into tmp448.d6 possibly generating a carry.
							// A grand total of up to 14 carries lost.

      mul_224_F128_159_special(&tmp224, a, f);		// tmp224 = quotient * f, we only compute the low 224-bits here

      a.d0 = __sub_cc (b.d0, tmp224.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp224.d1);		// we do not need the upper digits of b and tmp224 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp224.d2);
      a.d3 = __subc_cc(b.d3, tmp224.d3);
      a.d4 = __subc_cc(b.d4, tmp224.d4);
      a.d5 = __subc_cc(b.d5, tmp224.d5);
      a.d6 = __subc   (b.d6, tmp224.d6);
							// Since the quotient was up to 14 too small, the remainder has a maximum value of 15*f,
							// or 215 bits + log2 (15) bits, which is 218.907 bits.
    }

    mod_simple_224(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 15 times f.

#if 0
    if(cmp_ge_224(finalrem,f))
    {
      printf("EEEEEK, final finalrem is >= f\n");
    }
if (blockIdx.x == 12 && threadIdx.x == 4){
mul_224_448_no_low7(&tmp224,u,f);
printf ("    f: %08X%08X%08X%08X%08X%08X%08X\r\n", f.d6, f.d5, f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X %X %X\r\n", u.d6, u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X %X %X\r\n", tmp224.d6, tmp224.d5, tmp224.d4, tmp224.d3, tmp224.d2, tmp224.d1, tmp224.d0);
printf ("  rem: %08X%08X%08X%08X%08X%08X%08X\r\n", finalrem.d6, finalrem.d5, finalrem.d4, finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* f.d0 is 1 and f.d1, f.d2, f.d3 are 0, so the first alternative below is finalrem == f - 1 */
/* and the second one is finalrem == 1.  NOTE(review): presumably f - 1 means f divides the */
/* Fermat number F(exp-2) and 1 means f divides a smaller Fermat number - confirm. */

    if(finalrem.d3 == 0 && finalrem.d2 == 0 && finalrem.d1 == 0)
    if((finalrem.d5 == f.d5 && finalrem.d4 == f.d4 && finalrem.d0 == 0 && finalrem.d6 == f.d6) ||
       (finalrem.d5 == 0    && finalrem.d4 == 0    && finalrem.d0 == 1 && finalrem.d6 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=7;
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*7 + 2]=1;			/* f.d0 */
        RES[index*7 + 3]=0;			/* f.d1 */
        RES[index*7 + 4]=0;			/* f.d2 */
        RES[index*7 + 5]=0;			/* f.d3 */
        RES[index*7 + 6]=f.d4;
        RES[index*7 + 7]=f.d5;
        RES[index*7 + 8]=f.d6;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* If the block had no candidates at all, f and finalrem were never assigned.  In that case the */
/* first word is 0 and the remaining validation words are meaningless. */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 7);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=f.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=f.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=f.d6;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]= finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9]= finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+11]=finalrem.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+12]=finalrem.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+13]=finalrem.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+14]=finalrem.d6;
  }
}
