/*
This file is part of mfaktc.
Copyright (C) 2012  George Woltman (woltman@alum.mit.edu)

mfaktc is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

mfaktc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with mfaktc.  If not, see <http://www.gnu.org/licenses/>.
*/


__device__ static void mulsub_96_M31_initial_special(int96 *res, int96 a, int96 negb)
/* res = 0 - a * b (only lower 96 bits of the result), a.d2 is zero

   The caller passes negb = -b, so the routine simply computes the low three
   32-bit words of the product a * negb using multiply-add instructions.
   Because a.d2 is assumed to be zero, no partial product involving a.d2 is
   formed; a.d2 is still listed as operand %5 but is never referenced.

   Operand map: %0..%2 = res->d0..d2, %3..%5 = a.d0..d2, %6..%8 = negb.d0..d2.
   The instruction order matters: every madc consumes the carry flag produced
   by the mad/madc directly above it. */
{
  asm("{\n\t"
      "mul.lo.u32      %0, %3, %6;\n\t"        /* res.d0  = (a.d0 * negb.d0).lo */
      "mul.lo.u32      %1, %3, %7;\n\t"        /* res.d1  = (a.d0 * negb.d1).lo */
      "mul.lo.u32      %2, %3, %8;\n\t"        /* res.d2  = (a.d0 * negb.d2).lo */

      "mad.hi.cc.u32   %1, %3, %6, %1;\n\t"    /* res.d1 += (a.d0 * negb.d0).hi, sets carry */
      "madc.hi.u32     %2, %3, %7, %2;\n\t"    /* res.d2 += (a.d0 * negb.d1).hi + carry */

      "mad.lo.cc.u32   %1, %4, %6, %1;\n\t"    /* res.d1 += (a.d1 * negb.d0).lo, sets carry */
      "madc.lo.u32     %2, %4, %7, %2;\n\t"    /* res.d2 += (a.d1 * negb.d1).lo + carry */

      "mad.hi.u32      %2, %4, %6, %2;\n\t"    /* res.d2 += (a.d1 * negb.d0).hi (carry out of d2 is discarded) */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2),
        "r" (negb.d0), "r" (negb.d1), "r" (negb.d2));
}


__device__ static void mulsub_128_M31_initial_special(int128 *res, int128 a, int128 negb)
/* res = 0 - a * b (only lower 128 bits of the result), a.d2, a.d3 are zero

   The caller passes negb = -b, so the routine simply computes the low four
   32-bit words of the product a * negb using multiply-add instructions.
   Because a.d2 and a.d3 are assumed to be zero, no partial product involving
   them is formed; they are still listed as operands %6 and %7 but are never
   referenced.

   Operand map: %0..%3 = res->d0..d3, %4..%7 = a.d0..d3, %8..%11 = negb.d0..d3.
   The instruction order matters: every madc consumes the carry flag produced
   by the mad/madc directly above it. */
{
  asm("{\n\t"
      "mul.lo.u32      %0, %4, %8;\n\t"        /* res.d0  = (a.d0 * negb.d0).lo */
      "mul.lo.u32      %1, %4, %9;\n\t"        /* res.d1  = (a.d0 * negb.d1).lo */
      "mul.lo.u32      %2, %4, %10;\n\t"       /* res.d2  = (a.d0 * negb.d2).lo */
      "mul.lo.u32      %3, %4, %11;\n\t"       /* res.d3  = (a.d0 * negb.d3).lo */

      "mad.hi.cc.u32   %1, %4, %8, %1;\n\t"    /* res.d1 += (a.d0 * negb.d0).hi, sets carry */
      "madc.hi.cc.u32  %2, %4, %9, %2;\n\t"    /* res.d2 += (a.d0 * negb.d1).hi + carry, sets carry */
      "madc.hi.u32     %3, %4, %10, %3;\n\t"   /* res.d3 += (a.d0 * negb.d2).hi + carry */

      "mad.lo.cc.u32   %1, %5, %8, %1;\n\t"    /* res.d1 += (a.d1 * negb.d0).lo, sets carry */
      "madc.lo.cc.u32  %2, %5, %9, %2;\n\t"    /* res.d2 += (a.d1 * negb.d1).lo + carry, sets carry */
      "madc.lo.u32     %3, %5, %10, %3;\n\t"   /* res.d3 += (a.d1 * negb.d2).lo + carry */

      "mad.hi.cc.u32   %2, %5, %8, %2;\n\t"    /* res.d2 += (a.d1 * negb.d0).hi, sets carry */
      "madc.hi.u32     %3, %5, %9, %3;\n\t"    /* res.d3 += (a.d1 * negb.d1).hi + carry (carry out of d3 is discarded) */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3),
        "r" (negb.d0), "r" (negb.d1), "r" (negb.d2), "r" (negb.d3));
}


//
// Kernel to factor MM31, works on f between 65 and 89 bits inclusive (k = 2^31 to k = 2^57)
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MIN_BLOCKS) mfaktc_barrett89_M31gs(int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES, unsigned int bits_max)
/*
Trial-factor MM31 = 2^M31 - 1 (M31 = 2^31 - 1) with candidates f = 2 * k * M31 + 1, where the
k values to test are the set bits of a sieved bit array.

Parameters
  k_base           k value that corresponds to bit 0 of block 0 (only d0 and d1 are used)
  bit_array        sieve output; block b processes the bits_to_process bits starting at
                   bit b * bits_to_process.  Bit n of a block stands for
                   k = k_base + (b * bits_to_process + n) * NUM_CLASSES
  bits_to_process  number of sieve bits handled per block.  Each of the 256 threads takes
                   bits_to_process / 8192 32-bit words, so this must be a non-zero multiple
                   of 8192.  Bit indexes are stored as unsigned short, so it must not exceed 65536.
  RES              result array: RES[0] is an atomically incremented factor counter, RES[1] is
                   set to 3 when a factor is found, factors are stored as 3 words each starting
                   at RES[2].  Validation data is written at RESULTS_ARRAY_VALIDATION_OFFSET.
  bits_max         selects the shift counts of the Barrett reduction.  Per the comments below it
                   appears to be bits_in_f - 64 -- confirm against the launching code.

Launch requirements
  256 threads per block (the prefix sum below is hard-wired to that size) and enough dynamic
  shared memory to hold one unsigned short per set bit in the block's section of the sieve.

Method
  a = 2^128 mod f is computed directly, then squared 24 times modulo f, giving 2^(2^31) mod f.
  f divides MM31 exactly when that value is 2.
*/
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int96 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // The validation write at the end of the kernel reads f and finalrem even when this thread
  // never enters the candidate loop.  Give them defined values.

  f.d0 = 0; f.d1 = 0; f.d2 = 0;
  finalrem.d0 = 0; finalrem.d1 = 0; finalrem.d2 = 0;

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // In every step below a thread that adds only reads a slot whose owner does not add in that
  // same step, so one barrier between steps is sufficient.  The first five steps stay within
  // one warp, but warps are not guaranteed to execute in lock-step (independent thread
  // scheduling), so these steps are separated by barriers too.  All barriers are outside the
  // conditionals and therefore reached by every thread of the block.

  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// Each thread fills its own slice of smem: the slice starts at total_bit_count - bitcount[threadIdx.x]
// and is as long as this thread's popcount.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first)

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1u << bit_to_test);		// unsigned constant: shifting a signed 1 into bit 31 overflows

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int96 a, u, tmp96, negf;
    int192 b, tmp192;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = 2 * k * M31 + 1 = k * 2^32 - 2k + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;				// f = k * 2^32 + 1
    f.d1 = k.d0;
    f.d2 = k.d1;

    k.d0 = __add_cc (k.d0, k.d0);	// 2k
    k.d1 = __addc   (k.d1, k.d1);

    f.d0 = __sub_cc  (f.d0, k.d0);	// f -= 2k
    f.d1 = __subc_cc (f.d1, k.d1);
    f.d2 = __subc    (f.d2, 0);

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d2);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d1);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp192.d5 = 1 << (bits_max - 1);			// tmp192 = 2^(95 + bits_in_f)
    tmp192.d4 = 0; tmp192.d3 = 0; tmp192.d2 = 0; tmp192.d1 = 0; tmp192.d0 = 0;

    // Could write optimized div_192_96 with so many tmp192 elements known to be zero
    div_192_96(&u,tmp192,f,ff);				// u = floor(2^(95 + bits_in_f) / f), giving 96 bits of precision

							// b_preinit = 2^128
							// Let a = b_preinit / 2 ^ (bits_in_f - 1)
							// tmp96 = a * u / 2^96
							//        = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (95 + bits_in_f) / f) / 2^96  (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b_preinit / f
							// NOTE(review): for bits_max == 1 the left shift below has a count of 32 -- confirm the
							// caller never passes 1 or that the generated shift yields zero as intended.
    tmp96.d0 = (u.d1 >> (bits_max - 1)) + (u.d2 << (32 - (bits_max - 1)));
    tmp96.d1 = (u.d2 >> (bits_max - 1));
    tmp96.d2 = 0;					// The special mulsub below assumes a.d2 is zero; do not pass an uninitialized value

    negf.d0 = __sub_cc (0, f.d0);			// Negate f so that the mulsub macros can use multiply-accumulate instructions
    negf.d1 = __subc_cc(0, f.d1);
    negf.d2 = __subc   (0, f.d2);

    mulsub_96_M31_initial_special(&a, tmp96, negf);	// Compute the remainder: b_preinit - quotient * f, we only compute the low 96-bits here

    // 24 squarings take 2^128 = 2^(2^7) to 2^(2^31)
    for (shifter = 0; shifter < 24; shifter++)
    {
							// On input a is at most 91.807 bits (see end of this loop)

      square_96_192(&b, a);				// b = a^2, b is at most 183.614 bits

      a.d0 = (b.d2 >> (bits_max - 1)) + (b.d3 << (32 - (bits_max - 1))); // a = b / (2 ^ (bits_in_f - 1)), a is at most 95.614 bits
      a.d1 = (b.d3 >> (bits_max - 1)) + (b.d4 << (32 - (bits_max - 1)));
      a.d2 = (b.d4 >> (bits_max - 1)) + (b.d5 << (32 - (bits_max - 1)));

      mul_96_192_no_low3(&tmp96, a, u);			// tmp96 = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (95 + bits_in_f) / f) / 2^96   (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 6.  A full mul_96_192 would add 5 partial results
							// into tmp192.d2 which could have generated 4 carries into tmp192.d3.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d2 could have been added into
							// tmp192.d2 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d2 could have been added into tmp192.d2 possibly generating a carry.
							// A grand total of up to 6 carries lost.

      mulsub_96(&a, b, tmp96, negf);			// Compute the remainder: b - quotient * f, we only compute the low 96-bits here

							// Since the quotient was up to 6 too small, the remainder has a maximum value of 7*f,
							// or 89 bits + log2 (7) bits, which is 91.807 bits.
    }

    mod_simple_96(&finalrem, a, f, ff);			// Adjustment.  The code above may produce an a that is too large by up to 6 times f.

#if 0
    if(cmp_ge_96(finalrem,f))
    {
      printf("EEEEEK, final finalrem is >= f\n");
    }
if (blockIdx.x == 12 && threadIdx.x == 4){
mul_96_192_no_low3(&tmp96,u,f);
printf ("    f: %08X%08X%08X\r\n", f.d2, f.d1, f.d0);
printf ("u    : %X %X %X\r\n", u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X\r\n", tmp96.d2, tmp96.d1, tmp96.d0);
printf ("  rem: %08X%08X%08X\r\n", finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* 2^(2^31) mod f == 2 means 2^(2^31 - 1) mod f == 1 (f is odd), i.e. f divides MM31 */

    if(finalrem.d0 == 2 && ((finalrem.d2 | finalrem.d1) == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=3;
      if(index<10)				/* limit to 10 factors per class */
      {
	RES[index*3 + 2]=f.d0;
	RES[index*3 + 3]=f.d1;
	RES[index*3 + 4]=f.d2;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* f and finalrem hold the last candidate this thread tested, or zero if it tested none */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 3);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=f.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=f.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=f.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=finalrem.d2;
  }
}


//
// Kernel to factor MM31, works on f between 90 and 96 bits inclusive (k = 2^58 to k = 2^64 (actual max is 2^63))
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MIN_BLOCKS) mfaktc_barrett96_M31gs(int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES)
/*
Trial-factor MM31 = 2^M31 - 1 (M31 = 2^31 - 1) with candidates f = 2 * k * M31 + 1, where the
k values to test are the set bits of a sieved bit array.  This variant uses 128-bit
intermediates and a fixed 2^208 / f reciprocal, which requires f >= 81 bits.

Parameters
  k_base           k value that corresponds to bit 0 of block 0 (only d0 and d1 are used)
  bit_array        sieve output; block b processes the bits_to_process bits starting at
                   bit b * bits_to_process.  Bit n of a block stands for
                   k = k_base + (b * bits_to_process + n) * NUM_CLASSES
  bits_to_process  number of sieve bits handled per block.  Each of the 256 threads takes
                   bits_to_process / 8192 32-bit words, so this must be a non-zero multiple
                   of 8192.  Bit indexes are stored as unsigned short, so it must not exceed 65536.
  RES              result array: RES[0] is an atomically incremented factor counter, RES[1] is
                   set to 3 when a factor is found, factors are stored as 3 words each starting
                   at RES[2].  Validation data is written at RESULTS_ARRAY_VALIDATION_OFFSET.

Launch requirements
  256 threads per block (the prefix sum below is hard-wired to that size) and enough dynamic
  shared memory to hold one unsigned short per set bit in the block's section of the sieve.

Method
  a = 2^128 mod f is computed directly, then squared 24 times modulo f, giving 2^(2^31) mod f.
  f divides MM31 exactly when that value is 2.
*/
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int96 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // The validation write at the end of the kernel reads f and finalrem even when this thread
  // never enters the candidate loop.  Give them defined values.

  f.d0 = 0; f.d1 = 0; f.d2 = 0;
  finalrem.d0 = 0; finalrem.d1 = 0; finalrem.d2 = 0;

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // In every step below a thread that adds only reads a slot whose owner does not add in that
  // same step, so one barrier between steps is sufficient.  The first five steps stay within
  // one warp, but warps are not guaranteed to execute in lock-step (independent thread
  // scheduling), so these steps are separated by barriers too.  All barriers are outside the
  // conditionals and therefore reached by every thread of the block.

  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// Each thread fills its own slice of smem: the slice starts at total_bit_count - bitcount[threadIdx.x]
// and is as long as this thread's popcount.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first)

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1u << bit_to_test);		// unsigned constant: shifting a signed 1 into bit 31 overflows

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int128 a, u, tmp128, negf;
    int224 b, tmp224;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = 2 * k * M31 + 1 = k * 2^32 - 2k + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;				// f = k * 2^32 + 1
    f.d1 = k.d0;
    f.d2 = k.d1;

    k.d0 = __add_cc (k.d0, k.d0);	// 2k
    k.d1 = __addc   (k.d1, k.d1);

    f.d0 = __sub_cc  (f.d0, k.d0);	// f -= 2k
    f.d1 = __subc_cc (f.d1, k.d1);
    f.d2 = __subc    (f.d2, 0);

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d2);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d1);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp224.d6 = 0x10000;				// tmp224 is 2^208
    tmp224.d5 = 0; tmp224.d4 = 0; tmp224.d3 = 0; tmp224.d2 = 0; tmp224.d1 = 0; tmp224.d0 = 0;

    // Could write optimized div_224_96 with so many tmp224 elements known to be zero
    div_224_96(&u,tmp224,f,ff);				// u = floor(2^208 / f).  This requires f >= 81 bits.

							// b_preinit = 2^128
							// Let a = b_preinit / 2^80 = 2^48
							// tmp128 = a * u / 2^128
							//        = (b_preinit / 2^80) * (2^208 / f) / 2^128  (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b_preinit / f
    tmp128.d0 = (u.d3 << 16) + (u.d2 >> 16);
    tmp128.d1 =                (u.d3 >> 16);
    tmp128.d2 = 0;					// The special mulsub below assumes a.d2 and a.d3 are zero;
    tmp128.d3 = 0;					// do not pass uninitialized values

    negf.d0 = __sub_cc (0, f.d0);			// Negate f so that the mulsub macros can use multiply-accumulate instructions
    negf.d1 = __subc_cc(0, f.d1);
    negf.d2 = __subc_cc(0, f.d2);
    negf.d3 = __subc   (0, 0);

    mulsub_128_M31_initial_special(&a, tmp128, negf);	// Compute the remainder: b_preinit - quotient * f, we only compute the low 128-bits here

    // 24 squarings take 2^128 = 2^(2^7) to 2^(2^31)
    for (shifter = 0; shifter < 24; shifter++)
    {
							// On input a is at most 99.17 bits (see end of this loop)

      square_128_224(&b, a);				// b = a^2, b is at most 198.34 bits

      a.d0 = (b.d3 << 16) + (b.d2 >> 16);		// a = b / 2^80, a is at most 118.34 bits
      a.d1 = (b.d4 << 16) + (b.d3 >> 16);
      a.d2 = (b.d5 << 16) + (b.d4 >> 16);
      a.d3 = (b.d6 << 16) + (b.d5 >> 16);

      mul_128_256_no_low4(&tmp128, a, u);		// tmp128 = (b / 2^80) * (2^208 / f) / 2^128    (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 8.  A full mul_128_256 would add 7 partial results
							// into tmp256.d3 which could have generated 6 carries into tmp256.d4.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d3 could have been added into
							// tmp256.d3 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d3 could have been added into tmp256.d3 possibly generating a carry.
							// A grand total of up to 8 carries lost.

      mulsub_128_224(&a, b, tmp128, negf);		// Compute the remainder: b - quotient * f, we only compute the low 128-bits here

							// Since the quotient was up to 8 too small, the remainder has a maximum value of 9*f,
							// or 96 bits + log2 (9) bits, which is 99.17 bits.
    }

    mod_simple_128_96(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 8 times f.

#if 0
    if(cmp_ge_96(finalrem,f))
    {
      printf("EEEEEK, final rem is >= f\n");
    }
if ((blockIdx.x == 0 && threadIdx.x == 4)){
int128 f128;	    
f128.d0 = f.d0;
f128.d1 = f.d1;
f128.d2 = f.d2;
f128.d3 = 0;
mul_128_256_no_low4(&tmp128,u,f128);
printf ("    f: %08X%08X%08X\r\n", f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X\r\n", u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X\r\n", tmp128.d3, tmp128.d2, tmp128.d1, tmp128.d0);
printf ("  rem: %08X%08X%08X\r\n", finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* 2^(2^31) mod f == 2 means 2^(2^31 - 1) mod f == 1 (f is odd), i.e. f divides MM31 */

    if(finalrem.d0 == 2 && (finalrem.d2 | finalrem.d1) == 0)
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=3;
      if(index<10)				/* limit to 10 factors per class */
      {
	RES[index*3 + 2]=f.d0;
	RES[index*3 + 3]=f.d1;
	RES[index*3 + 4]=f.d2;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* f and finalrem hold the last candidate this thread tested, or zero if it tested none */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 3);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=f.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=f.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=f.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=finalrem.d2;
  }
}
