/*
This file is part of mfaktc.
Copyright (C) 2012  George Woltman (woltman@alum.mit.edu)

mfaktc is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

mfaktc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with mfaktc.  If not, see <http://www.gnu.org/licenses/>.
*/


__device__ static void mulsub_160_M107_initial_special(int160 *res, int160 a, int96 negk, int96 k2m1)
/* res = 0 - a * b (only lower 160 bits of the result), a.d4 is zero, where b is -negk * 2^96 - k2m1.

   Since -b = negk * 2^96 + k2m1, the subtraction is performed purely with multiply-add
   instructions: a * k2m1 contributes to result words 0..4 and a * negk contributes from
   word 3 upwards.  Partial products that would only land above word 4 are not computed.

   Operand map of the PTX below:
     %0  - %4   res->d0 .. res->d4   (outputs)
     %5  - %9   a.d0 .. a.d4         (a.d4 / %9 is passed in but never referenced)
     %10 - %11  k2m1.d0, k2m1.d1     (k2m1.d2 is not read)
     %12 - %13  negk.d0, negk.d1     (negk.d2 is not read)

   Each blank-line separated group is one carry chain (mad.cc / madc.cc / madc); the last
   instruction of a group targets the top word and drops the carry out.  Do not reorder
   instructions inside a group. */
{
  asm("{\n\t"
      "mul.lo.u32      %0, %5, %10;\n\t"       /* 0 -= (a.d0 * -k2m1.d0).lo */
      "mul.lo.u32      %1, %5, %11;\n\t"       /* 0 -= (a.d0 * -k2m1.d1).lo */
      "mul.lo.u32      %2, %7, %10;\n\t"       /* 0 -= (a.d2 * -k2m1.d0).lo */
      "mul.lo.u32      %3, %7, %11;\n\t"       /* 0 -= (a.d2 * -k2m1.d1).lo */

      "mad.hi.cc.u32   %1, %5, %10, %1;\n\t"   /* 0 -= (a.d0 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %2, %5, %11, %2;\n\t"   /* 0 -= (a.d0 * -k2m1.d1).hi */
      "madc.hi.cc.u32  %3, %7, %10, %3;\n\t"   /* 0 -= (a.d2 * -k2m1.d0).hi */
      "madc.hi.u32     %4, %7, %11, 0;\n\t"    /* 0 -= (a.d2 * -k2m1.d1).hi */

      "mad.lo.cc.u32   %1, %6, %10, %1;\n\t"   /* 0 -= (a.d1 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %2, %6, %11, %2;\n\t"   /* 0 -= (a.d1 * -k2m1.d1).lo */
      "madc.lo.cc.u32  %3, %8, %10, %3;\n\t"   /* 0 -= (a.d3 * -k2m1.d0).lo */
      "madc.lo.u32     %4, %8, %11, %4;\n\t"   /* 0 -= (a.d3 * -k2m1.d1).lo */

      "mad.hi.cc.u32   %2, %6, %10, %2;\n\t"   /* 0 -= (a.d1 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %3, %6, %11, %3;\n\t"   /* 0 -= (a.d1 * -k2m1.d1).hi */
      "madc.hi.u32     %4, %8, %10, %4;\n\t"   /* 0 -= (a.d3 * -k2m1.d0).hi */

      "mad.lo.cc.u32   %3, %5, %12, %3;\n\t"   /* 0 -= (a.d0 * -negk.d0).lo */
      "madc.lo.u32     %4, %5, %13, %4;\n\t"   /* 0 -= (a.d0 * -negk.d1).lo */

      "mad.hi.u32      %4, %5, %12, %4;\n\t"   /* 0 -= (a.d0 * -negk.d0).hi */

      "mad.lo.u32      %4, %6, %12, %4;\n\t"   /* 0 -= (a.d1 * -negk.d0).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4),
        "r" (k2m1.d0), "r" (k2m1.d1), "r" (negk.d0), "r" (negk.d1));
}

__device__ static void mulsub_160_M107_special(int160 *res, int320 c, int160 a, int96 negk, int96 k2m1)
/* res = c - a * b (only lower 160 bits of the result), where b is -negk * 2^96 - k2m1.

   Since -b = negk * 2^96 + k2m1, the result is built as c + a * k2m1 + a * negk * 2^96 using
   multiply-add instructions only.  The first group seeds the five result words with c.d0 .. c.d4;
   the higher words of c are never read.

   Operand map of the PTX below:
     %0  - %4   res->d0 .. res->d4   (outputs)
     %5  - %9   a.d0 .. a.d4
     %10 - %11  k2m1.d0, k2m1.d1     (k2m1.d2 is not read)
     %12 - %13  negk.d0, negk.d1     (negk.d2 is not read)
     %14 - %18  c.d0 .. c.d4

   Each blank-line separated group is one carry chain; the last instruction of a group targets
   the top word and drops the carry out.  Do not reorder instructions inside a group. */
{
  asm("{\n\t"
      "mad.lo.cc.u32   %0, %5, %10, %14;\n\t"  /* c -= (a.d0 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %1, %5, %11, %15;\n\t"  /* c -= (a.d0 * -k2m1.d1).lo */
      "madc.lo.cc.u32  %2, %7, %10, %16;\n\t"  /* c -= (a.d2 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %3, %7, %11, %17;\n\t"  /* c -= (a.d2 * -k2m1.d1).lo */
      "madc.lo.u32     %4, %9, %10, %18;\n\t"  /* c -= (a.d4 * -k2m1.d0).lo */

      "mad.hi.cc.u32   %1, %5, %10, %1;\n\t"   /* c -= (a.d0 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %2, %5, %11, %2;\n\t"   /* c -= (a.d0 * -k2m1.d1).hi */
      "madc.hi.cc.u32  %3, %7, %10, %3;\n\t"   /* c -= (a.d2 * -k2m1.d0).hi */
      "madc.hi.u32     %4, %7, %11, %4;\n\t"   /* c -= (a.d2 * -k2m1.d1).hi */

      "mad.lo.cc.u32   %1, %6, %10, %1;\n\t"   /* c -= (a.d1 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %2, %6, %11, %2;\n\t"   /* c -= (a.d1 * -k2m1.d1).lo */
      "madc.lo.cc.u32  %3, %8, %10, %3;\n\t"   /* c -= (a.d3 * -k2m1.d0).lo */
      "madc.lo.u32     %4, %8, %11, %4;\n\t"   /* c -= (a.d3 * -k2m1.d1).lo */

      "mad.hi.cc.u32   %2, %6, %10, %2;\n\t"   /* c -= (a.d1 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %3, %6, %11, %3;\n\t"   /* c -= (a.d1 * -k2m1.d1).hi */
      "madc.hi.u32     %4, %8, %10, %4;\n\t"   /* c -= (a.d3 * -k2m1.d0).hi */

      "mad.lo.cc.u32   %3, %5, %12, %3;\n\t"   /* c -= (a.d0 * -negk.d0).lo */
      "madc.lo.u32     %4, %5, %13, %4;\n\t"   /* c -= (a.d0 * -negk.d1).lo */

      "mad.hi.u32      %4, %5, %12, %4;\n\t"   /* c -= (a.d0 * -negk.d0).hi */

      "mad.lo.u32      %4, %6, %12, %4;\n\t"   /* c -= (a.d1 * -negk.d0).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4),
        "r" (k2m1.d0), "r" (k2m1.d1), "r" (negk.d0), "r" (negk.d1),
        "r" (c.d0), "r" (c.d1), "r" (c.d2), "r" (c.d3), "r" (c.d4));
}

__device__ static void mulsub_192_M107_initial_special(int192 *res, int192 a, int96 negk, int96 k2m1)
/* res = 0 - a * b (only lower 192 bits of the result), a.d3, a.d4, a.d5 are zero, b is -negk * 2^96 - k2m1.

   Since -b = negk * 2^96 + k2m1, the result is built as a * k2m1 + a * negk * 2^96 using
   multiply-add instructions only.  Because the upper three words of a are stated to be zero,
   only a.d0 .. a.d2 take part in the products.

   Operand map of the PTX below:
     %0  - %5   res->d0 .. res->d5   (outputs)
     %6  - %11  a.d0 .. a.d5         (%9 - %11 are passed in but never referenced)
     %12 - %13  k2m1.d0, k2m1.d1     (k2m1.d2 is not read)
     %14 - %16  negk.d0 .. negk.d2

   res->d3, res->d4 and res->d5 are first written by a "madc.hi ..., 0" instruction that ends a
   carry chain, so the chains below mix .lo and .hi products of different word positions.
   Do not reorder instructions inside a blank-line separated group. */
{
  asm("{\n\t"
      "mul.lo.u32      %0, %6, %12;\n\t"       /* 0 -= (a.d0 * -k2m1.d0).lo */
      "mul.lo.u32      %1, %6, %13;\n\t"       /* 0 -= (a.d0 * -k2m1.d1).lo */
      "mul.lo.u32      %2, %8, %12;\n\t"       /* 0 -= (a.d2 * -k2m1.d0).lo */

      "mad.lo.cc.u32   %1, %7, %12, %1;\n\t"   /* 0 -= (a.d1 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %2, %7, %13, %2;\n\t"   /* 0 -= (a.d1 * -k2m1.d1).lo */
      "madc.hi.u32     %3, %7, %13, 0;\n\t"    /* 0 -= (a.d1 * -k2m1.d1).hi */

      "mad.hi.cc.u32   %1, %6, %12, %1;\n\t"   /* 0 -= (a.d0 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %2, %6, %13, %2;\n\t"   /* 0 -= (a.d0 * -k2m1.d1).hi */
      "madc.hi.cc.u32  %3, %8, %12, %3;\n\t"   /* 0 -= (a.d2 * -k2m1.d0).hi */
      "madc.hi.u32     %4, %8, %13, 0;\n\t"    /* 0 -= (a.d2 * -k2m1.d1).hi */

      "mad.hi.cc.u32   %2, %7, %12, %2;\n\t"   /* 0 -= (a.d1 * -k2m1.d0).hi */
      "madc.lo.cc.u32  %3, %8, %13, %3;\n\t"   /* 0 -= (a.d2 * -k2m1.d1).lo */
      "madc.hi.cc.u32  %4, %6, %14, %4;\n\t"   /* 0 -= (a.d0 * -negk.d0).hi */
      "madc.hi.u32     %5, %6, %15, 0;\n\t"    /* 0 -= (a.d0 * -negk.d1).hi */

      "mad.lo.cc.u32   %3, %6, %14, %3;\n\t"   /* 0 -= (a.d0 * -negk.d0).lo */
      "madc.lo.cc.u32  %4, %6, %15, %4;\n\t"   /* 0 -= (a.d0 * -negk.d1).lo */
      "madc.lo.u32     %5, %6, %16, %5;\n\t"   /* 0 -= (a.d0 * -negk.d2).lo */

      "mad.lo.cc.u32   %4, %7, %14, %4;\n\t"   /* 0 -= (a.d1 * -negk.d0).lo */
      "madc.lo.u32     %5, %7, %15, %5;\n\t"   /* 0 -= (a.d1 * -negk.d1).lo */

      "mad.hi.u32      %5, %7, %14, %5;\n\t"   /* 0 -= (a.d1 * -negk.d0).hi */

      "mad.lo.u32      %5, %8, %14, %5;\n\t"   /* 0 -= (a.d2 * -negk.d0).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5),
        "r" (k2m1.d0), "r" (k2m1.d1), "r" (negk.d0), "r" (negk.d1), "r" (negk.d2));
}

__device__ static void mulsub_192_M107_special(int192 *res, int352 c, int192 a, int96 negk, int96 k2m1)
/* res = c - a * b (only lower 192 bits of the result), where b is -negk * 2^96 - k2m1.

   Since -b = negk * 2^96 + k2m1, the result is built as c + a * k2m1 + a * negk * 2^96 using
   multiply-add instructions only.  The first group seeds the six result words with c.d0 .. c.d5;
   the higher words of c are never read.

   Operand map of the PTX below:
     %0  - %5   res->d0 .. res->d5   (outputs)
     %6  - %11  a.d0 .. a.d5
     %12 - %13  k2m1.d0, k2m1.d1     (k2m1.d2 is not read)
     %14 - %16  negk.d0 .. negk.d2
     %17 - %22  c.d0 .. c.d5

   Each blank-line separated group is one carry chain; the last instruction of a group targets
   the top word and drops the carry out.  Do not reorder instructions inside a group. */
{
  asm("{\n\t"
      "mad.lo.cc.u32   %0, %6, %12, %17;\n\t"  /* c -= (a.d0 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %1, %6, %13, %18;\n\t"  /* c -= (a.d0 * -k2m1.d1).lo */
      "madc.lo.cc.u32  %2, %8, %12, %19;\n\t"  /* c -= (a.d2 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %3, %8, %13, %20;\n\t"  /* c -= (a.d2 * -k2m1.d1).lo */
      "madc.lo.cc.u32  %4, %10, %12, %21;\n\t" /* c -= (a.d4 * -k2m1.d0).lo */
      "madc.lo.u32     %5, %10, %13, %22;\n\t" /* c -= (a.d4 * -k2m1.d1).lo */

      "mad.hi.cc.u32   %1, %6, %12, %1;\n\t"   /* c -= (a.d0 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %2, %6, %13, %2;\n\t"   /* c -= (a.d0 * -k2m1.d1).hi */
      "madc.hi.cc.u32  %3, %8, %12, %3;\n\t"   /* c -= (a.d2 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %4, %8, %13, %4;\n\t"   /* c -= (a.d2 * -k2m1.d1).hi */
      "madc.hi.u32     %5, %10, %12, %5;\n\t"  /* c -= (a.d4 * -k2m1.d0).hi */

      "mad.lo.cc.u32   %1, %7, %12, %1;\n\t"   /* c -= (a.d1 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %2, %7, %13, %2;\n\t"   /* c -= (a.d1 * -k2m1.d1).lo */
      "madc.lo.cc.u32  %3, %9, %12, %3;\n\t"   /* c -= (a.d3 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %4, %9, %13, %4;\n\t"   /* c -= (a.d3 * -k2m1.d1).lo */
      "madc.lo.u32     %5, %11, %12, %5;\n\t"  /* c -= (a.d5 * -k2m1.d0).lo */

      "mad.hi.cc.u32   %2, %7, %12, %2;\n\t"   /* c -= (a.d1 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %3, %7, %13, %3;\n\t"   /* c -= (a.d1 * -k2m1.d1).hi */
      "madc.hi.cc.u32  %4, %9, %12, %4;\n\t"   /* c -= (a.d3 * -k2m1.d0).hi */
      "madc.hi.u32     %5, %9, %13, %5;\n\t"   /* c -= (a.d3 * -k2m1.d1).hi */

      "mad.lo.cc.u32   %3, %6, %14, %3;\n\t"   /* c -= (a.d0 * -negk.d0).lo */
      "madc.lo.cc.u32  %4, %6, %15, %4;\n\t"   /* c -= (a.d0 * -negk.d1).lo */
      "madc.lo.u32     %5, %6, %16, %5;\n\t"   /* c -= (a.d0 * -negk.d2).lo */

      "mad.hi.cc.u32   %4, %6, %14, %4;\n\t"   /* c -= (a.d0 * -negk.d0).hi */
      "madc.hi.u32     %5, %6, %15, %5;\n\t"   /* c -= (a.d0 * -negk.d1).hi */

      "mad.lo.cc.u32   %4, %7, %14, %4;\n\t"   /* c -= (a.d1 * -negk.d0).lo */
      "madc.lo.u32     %5, %7, %15, %5;\n\t"   /* c -= (a.d1 * -negk.d1).lo */

      "mad.hi.u32      %5, %7, %14, %5;\n\t"   /* c -= (a.d1 * -negk.d0).hi */

      "mad.lo.u32      %5, %8, %14, %5;\n\t"   /* c -= (a.d2 * -negk.d0).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5),
        "r" (k2m1.d0), "r" (k2m1.d1), "r" (negk.d0), "r" (negk.d1), "r" (negk.d2),
        "r" (c.d0), "r" (c.d1), "r" (c.d2), "r" (c.d3), "r" (c.d4), "r" (c.d5));
}


__device__ static void mulsub_192_160_M107_initial_special16(int192 *res, int192 a, int96 negk, int96 k2m1)
/* res = 0 - a * b (only lower 192 bits of the result), a.d4, a.d5 are zero, b is -negk * 2^96 - k2m1 and negk.d2 is 0xFFFFFFFF.

   Since -b = negk * 2^96 + k2m1, the result is built as a * k2m1 + a * negk * 2^96 using
   multiply-add instructions only.  negk.d2 is not passed to the PTX at all: the low word of
   a.d0 * 0xFFFFFFFF equals -a.d0 modulo 2^32, so that term is applied by the final sub.u32.

   Operand map of the PTX below:
     %0  - %5   res->d0 .. res->d5   (outputs)
     %6  - %11  a.d0 .. a.d5         (%10, %11 are passed in but never referenced)
     %12 - %13  k2m1.d0, k2m1.d1     (k2m1.d2 is not read)
     %14 - %15  negk.d0, negk.d1

   res->d4 and res->d5 are first written by a "madc.hi ..., 0" instruction that ends a carry
   chain.  Do not reorder instructions inside a blank-line separated group. */
{
  asm("{\n\t"
      "mul.lo.u32      %0, %6, %12;\n\t"       /* 0 -= (a.d0 * -k2m1.d0).lo */
      "mul.lo.u32      %1, %6, %13;\n\t"       /* 0 -= (a.d0 * -k2m1.d1).lo */
      "mul.lo.u32      %2, %8, %12;\n\t"       /* 0 -= (a.d2 * -k2m1.d0).lo */
      "mul.lo.u32      %3, %8, %13;\n\t"       /* 0 -= (a.d2 * -k2m1.d1).lo */

      "mad.hi.cc.u32   %1, %6, %12, %1;\n\t"   /* 0 -= (a.d0 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %2, %6, %13, %2;\n\t"   /* 0 -= (a.d0 * -k2m1.d1).hi */
      "madc.hi.cc.u32  %3, %8, %12, %3;\n\t"   /* 0 -= (a.d2 * -k2m1.d0).hi */
      "madc.hi.u32     %4, %8, %13, 0;\n\t"    /* 0 -= (a.d2 * -k2m1.d1).hi */

      "mad.lo.cc.u32   %1, %7, %12, %1;\n\t"   /* 0 -= (a.d1 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %2, %7, %13, %2;\n\t"   /* 0 -= (a.d1 * -k2m1.d1).lo */
      "madc.lo.cc.u32  %3, %9, %12, %3;\n\t"   /* 0 -= (a.d3 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %4, %9, %13, %4;\n\t"   /* 0 -= (a.d3 * -k2m1.d1).lo */
      "madc.hi.u32     %5, %9, %13, 0;\n\t"    /* 0 -= (a.d3 * -k2m1.d1).hi */

      "mad.hi.cc.u32   %2, %7, %12, %2;\n\t"   /* 0 -= (a.d1 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %3, %7, %13, %3;\n\t"   /* 0 -= (a.d1 * -k2m1.d1).hi */
      "madc.hi.cc.u32  %4, %9, %12, %4;\n\t"   /* 0 -= (a.d3 * -k2m1.d0).hi */
      "madc.hi.u32     %5, %7, %14, %5;\n\t"   /* 0 -= (a.d1 * -negk.d0).hi */

      "mad.lo.cc.u32   %3, %6, %14, %3;\n\t"   /* 0 -= (a.d0 * -negk.d0).lo */
      "madc.lo.cc.u32  %4, %6, %15, %4;\n\t"   /* 0 -= (a.d0 * -negk.d1).lo */
      "madc.lo.u32     %5, %8, %14, %5;\n\t"   /* 0 -= (a.d2 * -negk.d0).lo */

      "mad.hi.cc.u32   %4, %6, %14, %4;\n\t"   /* 0 -= (a.d0 * -negk.d0).hi */
      "madc.hi.u32     %5, %6, %15, %5;\n\t"   /* 0 -= (a.d0 * -negk.d1).hi */

      "mad.lo.cc.u32   %4, %7, %14, %4;\n\t"   /* 0 -= (a.d1 * -negk.d0).lo */
      "madc.lo.u32     %5, %7, %15, %5;\n\t"   /* 0 -= (a.d1 * -negk.d1).lo */

      "sub.u32         %5, %5, %6;\n\t"        /* 0 -= (a.d0 * -negk.d2).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5),
        "r" (k2m1.d0), "r" (k2m1.d1), "r" (negk.d0), "r" (negk.d1));
}

__device__ static void mulsub_192_160_M107_special(int192 *res, int352 c, int192 a, int96 negk, int96 k2m1)
/* res = c - a * b (only lower 192 bits of the result), where b is -negk * 2^96 - k2m1 and negk.d2 is 0xFFFFFFFF.

   Since -b = negk * 2^96 + k2m1, the result is built as c + a * k2m1 + a * negk * 2^96 using
   multiply-add instructions only.  The first group seeds the six result words with c.d0 .. c.d5;
   the higher words of c are never read.  negk.d2 is not passed to the PTX at all: the low word
   of a.d0 * 0xFFFFFFFF equals -a.d0 modulo 2^32, so that term is applied by the final sub.u32.

   Operand map of the PTX below:
     %0  - %5   res->d0 .. res->d5   (outputs)
     %6  - %11  a.d0 .. a.d5
     %12 - %13  k2m1.d0, k2m1.d1     (k2m1.d2 is not read)
     %14 - %15  negk.d0, negk.d1
     %16 - %21  c.d0 .. c.d5

   Each blank-line separated group is one carry chain; the last instruction of a group targets
   the top word and drops the carry out.  Do not reorder instructions inside a group. */
{
  asm("{\n\t"
      "mad.lo.cc.u32   %0, %6, %12, %16;\n\t"  /* c -= (a.d0 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %1, %6, %13, %17;\n\t"  /* c -= (a.d0 * -k2m1.d1).lo */
      "madc.lo.cc.u32  %2, %8, %12, %18;\n\t"  /* c -= (a.d2 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %3, %8, %13, %19;\n\t"  /* c -= (a.d2 * -k2m1.d1).lo */
      "madc.lo.cc.u32  %4, %10, %12, %20;\n\t" /* c -= (a.d4 * -k2m1.d0).lo */
      "madc.lo.u32     %5, %10, %13, %21;\n\t" /* c -= (a.d4 * -k2m1.d1).lo */

      "mad.hi.cc.u32   %1, %6, %12, %1;\n\t"   /* c -= (a.d0 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %2, %6, %13, %2;\n\t"   /* c -= (a.d0 * -k2m1.d1).hi */
      "madc.hi.cc.u32  %3, %8, %12, %3;\n\t"   /* c -= (a.d2 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %4, %8, %13, %4;\n\t"   /* c -= (a.d2 * -k2m1.d1).hi */
      "madc.hi.u32     %5, %10, %12, %5;\n\t"  /* c -= (a.d4 * -k2m1.d0).hi */

      "mad.lo.cc.u32   %1, %7, %12, %1;\n\t"   /* c -= (a.d1 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %2, %7, %13, %2;\n\t"   /* c -= (a.d1 * -k2m1.d1).lo */
      "madc.lo.cc.u32  %3, %9, %12, %3;\n\t"   /* c -= (a.d3 * -k2m1.d0).lo */
      "madc.lo.cc.u32  %4, %9, %13, %4;\n\t"   /* c -= (a.d3 * -k2m1.d1).lo */
      "madc.lo.u32     %5, %11, %12, %5;\n\t"  /* c -= (a.d5 * -k2m1.d0).lo */

      "mad.hi.cc.u32   %2, %7, %12, %2;\n\t"   /* c -= (a.d1 * -k2m1.d0).hi */
      "madc.hi.cc.u32  %3, %7, %13, %3;\n\t"   /* c -= (a.d1 * -k2m1.d1).hi */
      "madc.hi.cc.u32  %4, %9, %12, %4;\n\t"   /* c -= (a.d3 * -k2m1.d0).hi */
      "madc.hi.u32     %5, %9, %13, %5;\n\t"   /* c -= (a.d3 * -k2m1.d1).hi */

      "mad.lo.cc.u32   %3, %6, %14, %3;\n\t"   /* c -= (a.d0 * -negk.d0).lo */
      "madc.lo.cc.u32  %4, %6, %15, %4;\n\t"   /* c -= (a.d0 * -negk.d1).lo */
      "madc.lo.u32     %5, %8, %14, %5;\n\t"   /* c -= (a.d2 * -negk.d0).lo */

      "mad.hi.cc.u32   %4, %6, %14, %4;\n\t"   /* c -= (a.d0 * -negk.d0).hi */
      "madc.hi.u32     %5, %6, %15, %5;\n\t"   /* c -= (a.d0 * -negk.d1).hi */

      "mad.lo.cc.u32   %4, %7, %14, %4;\n\t"   /* c -= (a.d1 * -negk.d0).lo */
      "madc.lo.u32     %5, %7, %15, %5;\n\t"   /* c -= (a.d1 * -negk.d1).lo */

      "mad.hi.u32      %5, %7, %14, %5;\n\t"   /* c -= (a.d1 * -negk.d0).hi */

      "sub.u32         %5, %5, %6;\n\t"        /* c -= (a.d0 * -negk.d2).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5),
        "r" (k2m1.d0), "r" (k2m1.d1), "r" (negk.d0), "r" (negk.d1),
        "r" (c.d0), "r" (c.d1), "r" (c.d2), "r" (c.d3), "r" (c.d4), "r" (c.d5));
}


//
// Kernel to factor MM107, works on f between 129 and 152 bits inclusive (k = 2^21 to k = 2^43)
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MAX_BLOCKS) mfaktc_barrett152_M107gs(int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES, unsigned int bits_max)
/*
Trial-factoring kernel for MM107 that takes its candidates from a sieve bit array.

For every set bit in this block's slice of bit_array the kernel forms
f = 2 * k * M107 + 1 = k * 2^108 - 2k + 1, computes 2^(2^107) mod f with a Barrett
reduction (start value 2^256 followed by 99 squarings) and records f in RES when
the remainder equals 2.

Launch requirements visible in this code:
  - 256 threads per block: bitcount[] has 256 entries, the prefix sum covers the
    8 bits of the thread index, and each thread handles bits_to_process / (256 * 32)
    words of the bit array.
  - Dynamic shared memory (smem[]) with one unsigned short per set bit of the block's
    slice.  The launcher must size it; nothing in here detects an overflow.
  - Bit positions are stored as unsigned short, so positions above 65535 would be
    truncated -- NOTE(review): presumably bits_to_process <= 65536, confirm with the caller.

Parameters:
  k_base           base k of the launch; block n starts at k_base + n * bits_to_process * NUM_CLASSES
  bit_array        sieve bits, bits_to_process bits per block
  bits_to_process  number of sieve bits handled by one block
  RES              result buffer: RES[0] counts hits (atomicInc), RES[1] is set to 5, each hit
                   stores five words starting at RES[index*5 + 2]; block 0 / thread 0 also
                   writes a validation record at RESULTS_ARRAY_VALIDATION_OFFSET
  bits_max         only used as (bits_max - 1) in shift counts; going by the comments below
                   this is bits_in_f - 128 -- NOTE(review): confirm with the caller
*/
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  unsigned int stride;
  int160 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block
//
// In the step for a given stride every thread whose index has that bit set adds the count
// held by the last thread of the preceding group, i.e. index (threadIdx.x - stride) | (stride - 1).
// The thread that is read has the stride bit clear and therefore does not write in the same
// step, but its value was written by a different thread in the previous step.  The steps for
// strides 1..16 stay within one warp, yet warp lock-step execution is not guaranteed on GPUs
// with independent thread scheduling, so every step is preceded by a block-wide barrier.

  for (stride = 1; stride < 256; stride <<= 1) {
    __syncthreads();
    if (threadIdx.x & stride)
      bitcount[threadIdx.x] += bitcount[(threadIdx.x - stride) | (stride - 1)];
  }

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// This thread's candidates go to smem[total_bit_count - bitcount[threadIdx.x]] and upwards.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first) and clear it

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k, k2m1;
    int160 a, u, tmp160;
    int320 b, tmp320;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = 2 * k * M107 + 1 = k * 2^108 - 2k + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    k2m1.d0 = __add_cc (k.d0, k.d0);	// 2k-1
    k2m1.d1 = __addc   (k.d1, k.d1);
    k2m1.d0 = __sub_cc (k2m1.d0, 1);
    k2m1.d1 = __subc   (k2m1.d1, 0);

    k.d1 = (k.d1 << 12) + (k.d0 >> 20);	// Shift k up 12 bits
    k.d0 = (k.d0 << 12);

    f.d0 = __sub_cc (0, k2m1.d0);	// 1-2k
    f.d1 = __subc   (0, k2m1.d1);
    f.d2 = 0xFFFFFFFF;
    f.d3 = __sub_cc (k.d0, 1);		// k * 2^108 - propagated carry
    f.d4 = __subc   (k.d1, 0);

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d4);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d3);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp320.d9 = 1 << (bits_max - 1);			// tmp320 = 2^(159 + bits_in_f)
    tmp320.d8 = 0; tmp320.d7 = 0; tmp320.d6 = 0; tmp320.d5 = 0; tmp320.d4 = 0;
    tmp320.d3 = 0; tmp320.d2 = 0; tmp320.d1 = 0; tmp320.d0 = 0;

    // Could write optimized div_320_160 with so many tmp320 elements known to be zero
    div_320_160(&u,tmp320,f,ff);			// u = floor(2^(159 + bits_in_f) / f), giving 160 bits of precision, requires f >= 129 bits

							// b_preinit = 2^256
							// Let a = b_preinit / 2 ^ (bits_in_f - 1)
							// tmp160 = a * u / 2^160
							//        = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (159 + bits_in_f) / f) / 2^160   (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b_preinit / f
							// NOTE(review): for bits_max == 1 the left shifts below have a shift count of 32
    tmp160.d0 = (u.d1 >> (bits_max - 1)) + (u.d2 << (32 - (bits_max - 1)));
    tmp160.d1 = (u.d2 >> (bits_max - 1)) + (u.d3 << (32 - (bits_max - 1)));
    tmp160.d2 = (u.d3 >> (bits_max - 1)) + (u.d4 << (32 - (bits_max - 1)));
    tmp160.d3 = (u.d4 >> (bits_max - 1));
    tmp160.d4 = 0;					// The initial mulsub expects a.d4 == 0; do not pass an uninitialized word

    k.d0 = __sub_cc (0, k.d0);				// Negate k so that the mulsub macros can use multiply-accumulate instructions
    k.d1 = __subc   (0, k.d1);

    mulsub_160_M107_initial_special(&a, tmp160, k, k2m1); // Compute the remainder: b_preinit - quotient * f, we only compute the low 160-bits here

    // a starts as 2^(2^8) mod f (plus a small multiple of f); 99 squarings take the exponent to 2^107
    for (shifter = 0; shifter < 99; shifter++)
    {
							// On input a is at most 155.459 bits (see end of this loop)

      square_160_320(&b, a);				// b = a^2, b is at most 310.918 bits

      a.d0 = (b.d4 >> (bits_max - 1)) + (b.d5 << (32 - (bits_max - 1))); // a = b / (2 ^ (bits_in_f - 1)), a is at most 159.918 bits
      a.d1 = (b.d5 >> (bits_max - 1)) + (b.d6 << (32 - (bits_max - 1)));
      a.d2 = (b.d6 >> (bits_max - 1)) + (b.d7 << (32 - (bits_max - 1)));
      a.d3 = (b.d7 >> (bits_max - 1)) + (b.d8 << (32 - (bits_max - 1)));
      a.d4 = (b.d8 >> (bits_max - 1)) + (b.d9 << (32 - (bits_max - 1)));

      mul_160_320_no_low5(&tmp160, a, u);		// tmp160 = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (159 + bits_in_f) / f) / 2^160   (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 10.  A full mul_160_320 would add 9 partial results
							// into tmp320.d4 which could have generated 8 carries into tmp320.d5.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d4 could have been added into
							// tmp320.d4 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d4 could have been added into tmp320.d4 possibly generating a carry.
							// A grand total of up to 10 carries lost.

      mulsub_160_M107_special(&a, b, tmp160, k, k2m1);	// Compute the remainder: b - quotient * f, we only compute the low 160-bits here

							// Since the quotient was up to 10 too small, the remainder has a maximum value of 11*f,
							// or 152 bits + log2 (11) bits, which is 155.459 bits.
    }

    mod_simple_160(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 13 times f.
							// NOTE(review): the loop comment above derives a bound of 11*f, not 13*f -- confirm which one is intended.

#if 0
    if(cmp_ge_160(finalrem,f) && f.d4)
    {
      printf("EEEEEK, final finalrem is >= f\n");
    }
if (blockIdx.x == 0 && threadIdx.x == 4){
mul_160_320_no_low5(&tmp160,u,f);
printf ("    f: %08X%08X%08X%08X%08X\r\n", f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X\r\n", u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X\r\n", tmp160.d4, tmp160.d3, tmp160.d2, tmp160.d1, tmp160.d0);
printf ("  rem: %08X%08X%08X%08X%08X\r\n", finalrem.d4, finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */

    if(finalrem.d0 == 2 && ((finalrem.d4 | finalrem.d3 | finalrem.d2 | finalrem.d1) == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=5;
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*5 + 2]=f.d0;
        RES[index*5 + 3]=f.d1;
        RES[index*5 + 4]=0xFFFFFFFF;		/* f.d2 is always 0xFFFFFFFF */
        RES[index*5 + 5]=f.d3;
        RES[index*5 + 6]=f.d4;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* f and finalrem hold the last candidate tested by thread 0; they are only meaningful when the first word written is 5 */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 5);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=f.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=f.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0xFFFFFFFF;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=f.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=f.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9]=finalrem.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=finalrem.d4;
  }
}


//
// Kernel to factor MM107, works on f between 152 and 160 bits inclusive (k = 2^44 to k = 2^52)
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MIN_BLOCKS) mfaktc_barrett160_M107gs(int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES)
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int160 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // First five tallies remain within one warp.  Should be in lock-step.
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k, k2m1;
    int192 a, u, tmp192;
    int352 b, tmp352;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = 2 * k * M107 + 1 = k * 2^108 - 2k + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    k2m1.d0 = __add_cc (k.d0, k.d0);	// 2k-1
    k2m1.d1 = __addc   (k.d1, k.d1);
    k2m1.d0 = __sub_cc (k2m1.d0, 1);
    k2m1.d1 = __subc   (k2m1.d1, 0);

    k.d1 = (k.d1 << 12) + (k.d0 >> 20);	// Shift k up 12 bits
    k.d0 = (k.d0 << 12);

    f.d0 = __sub_cc (0, k2m1.d0);	// 1-2k
    f.d1 = __subc   (0, k2m1.d1);
    f.d2 = 0xFFFFFFFF;
    f.d3 = __sub_cc (k.d0, 1);		// k * 2^108 - propagated carry
    f.d4 = __subc   (k.d1, 0);

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d4);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d3);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp352.d10 = 0x00010000;				// tmp352 is 2^336
    tmp352.d9 = 0; tmp352.d8 = 0; tmp352.d7 = 0; tmp352.d6 = 0; tmp352.d5 = 0;
    tmp352.d4 = 0; tmp352.d3 = 0; tmp352.d2 = 0; tmp352.d1 = 0; tmp352.d0 = 0;

    // Could write optimized div_352_160 with so many tmp352 elements known to be zero
    div_352_160(&u,tmp352,f,ff);			// u = floor(2^336 / f).  This requires f >= 145 bits.

							// b_preinit = 2^256
							// Let a = b_preinit / 2^144 = 2^112
							// tmp192 = a * u / 2^192
							//        = (b_preinit / 2^144) * (2^336 / f) / 2^192    (ignore the floor functions for now)
    tmp192.d0 = (u.d3 << 16) + (u.d2 >> 16);		// which if we do the math simplifies to the quotient: b_preinit / f
    tmp192.d1 = (u.d4 << 16) + (u.d3 >> 16);
    tmp192.d2 = (u.d5 << 16) + (u.d4 >> 16);
    tmp192.d3 =                (u.d5 >> 16);

    k.d0 = __sub_cc (0, k.d0);				// Negate k so that the mulsub macros can use multiply-accumulate instructions
    k.d1 = __subc   (0, k.d1);

    mulsub_192_160_M107_initial_special16(&a, tmp192, k, k2m1); // Compute the remainder: b_preinit - quotient * f, we only compute the low 192-bits here

    for (shifter = 0; shifter < 99; shifter++)
    {
							// On input a is at most 163.700 bits (see end of this loop)

      square_192_352(&b, a);				// b = a^2, b is at most 327.400 bits

      a.d0 = (b.d5 << 16) + (b.d4 >> 16);		// a = b / 2^144, a is at most 183.400 bits
      a.d1 = (b.d6 << 16) + (b.d5 >> 16);
      a.d2 = (b.d7 << 16) + (b.d6 >> 16);
      a.d3 = (b.d8 << 16) + (b.d7 >> 16);
      a.d4 = (b.d9 << 16) + (b.d8 >> 16);
      a.d5 = (b.d10 << 16) + (b.d9 >> 16);

      mul_192_384_no_low6(&tmp192, a, u);		// tmp192 = (b / 2^144) * (2^336 / f) / 2^192     (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 12.  A full mul_192_384 would add 11 partial results
							// into tmp384.d5 which could have generated 10 carries into tmp384.d6.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d5 could have been added into
							// tmp384.d5 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d5 could have been added into tmp384.d5 possibly generating a carry.
							// A grand total of up to 12 carries lost.

      mulsub_192_160_M107_special(&a, b, tmp192, k, k2m1); // Compute the remainder: b - quotient * f, we only compute the low 192-bits here

							// Since the quotient was up to 12 too small, the remainder has a maximum value of 13*f,
							// or 160 bits + log2 (13) bits, which is 163.700 bits.
    }

    mod_simple_192_160(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 13 times f.

#if 0
    if(cmp_ge_160(finalrem,f))
    {
      printf("EEEEEK, final finalrem is >= f\n");
    }
if (blockIdx.x == 12 && threadIdx.x == 4){
int192 f192;	    
f192.d0 = f.d0;
f192.d1 = f.d1;
f192.d2 = f.d2;
f192.d3 = f.d3;
f192.d4 = f.d4;
f192.d5 = 0;
mul_192_384_no_low6(&tmp192,u,f192);
printf ("    f: %08X%08X%08X%08X%08X\r\n", f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X %X\r\n", u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X %X\r\n", tmp192.d5, tmp192.d4, tmp192.d3, tmp192.d2, tmp192.d1, tmp192.d0);
printf ("  rem: %08X%08X%08X%08X%08X\r\n", finalrem.d4, finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */

    if(finalrem.d0 == 2 && ((finalrem.d4 | finalrem.d3 | finalrem.d2 | finalrem.d1) == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=5;
      if(index<10)				/* limit to 10 factors per class */
      {
	RES[index*5 + 2]=f.d0;
	RES[index*5 + 3]=f.d1;
	RES[index*5 + 4]=0xFFFFFFFF;
	RES[index*5 + 5]=f.d3;
	RES[index*5 + 6]=f.d4;
      }
    }
  }

/* finally write occasional result for validation by C code */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 5);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=f.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=f.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0xFFFFFFFF;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=f.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=f.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9]=finalrem.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=finalrem.d4;
  }
}


//
// Kernel to factor MM107, works on f between 161 and 172 bits inclusive (k = 2^53 to k = 2^64 (actual max is 2^63))
//
// Each block compacts the set bits of its section of the sieve bit array into a list of
// candidate offsets in shared memory, then every thread trial-factors candidates from that list.
//
// Parameters:
//   k_base          - k value corresponding to bit 0 of bit_array (only d0/d1 are read; k is limited to 64 bits)
//   bit_array       - sieve output; each block handles bits_to_process bits, each thread
//                     bits_to_process / 8192 consecutive 32-bit words of them
//   bits_to_process - number of sieve bits per block
//   RES             - result array: RES[0] is the factor counter (atomicInc, wraps at 10000), RES[1] is set
//                     to 6 when a factor is found, factors are stored six words each starting at RES[2],
//                     and block 0 / thread 0 writes a validation record at RESULTS_ARRAY_VALIDATION_OFFSET
//
// Launch requirements visible in the code below:
//   - The prefix sum is hard-wired to 256 threads per block (bitcount[256], block total read from
//     bitcount[255]); THREADS_PER_BLOCK is presumably 256 -- confirm against the launcher.
//   - Dynamic shared memory (smem[]) must hold one unsigned short per set bit in the block's section.
//   - Bit offsets and bit counts are stored as unsigned short, so they must fit in 16 bits.
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MIN_BLOCKS) mfaktc_barrett172_M107gs(int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES)
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int192 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;		// 8192 = 256 threads * 32 bits per word
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // Every tally step reads a slot that a different thread wrote in the previous step (or in the
  // popcount loop above).  The first five steps stay within one warp, but warp lock-step execution
  // is not guaranteed on GPUs with independent thread scheduling (Volta and later), so each step is
  // preceded by a barrier.  All threads of the block reach every barrier: only the additions are
  // conditional, never the __syncthreads() calls.  Within a single step the slots being read belong
  // to threads that do not write in that step, so no barrier is needed inside a step.
  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// This thread's first output slot is total_bit_count minus its inclusive prefix count, so slots are
// handed out from the highest-numbered thread downwards and no two threads share a slot.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;	// Bit offset (within this block's section) of bit 0 of the current word
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first), then clear it

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value for this block: k_base += blockIdx.x * bits_to_process * NUM_CLASSES
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k, k2m1;
    int192 a, u, tmp192;
    int352 b, tmp352;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = 2 * k * M107 + 1 = k * 2^108 - 2k + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    k2m1.d0 = __add_cc (k.d0, k.d0);	// 2k-1
    k2m1.d1 = __addc   (k.d1, k.d1);
    k2m1.d0 = __sub_cc (k2m1.d0, 1);
    k2m1.d1 = __subc   (k2m1.d1, 0);

    k.d2 =                (k.d1 >> 20);	// Shift k up 12 bits
    k.d1 = (k.d1 << 12) + (k.d0 >> 20);
    k.d0 = (k.d0 << 12);

    f.d0 = __sub_cc (0, k2m1.d0);	// 1-2k
    f.d1 = __subc   (0, k2m1.d1);
    f.d2 = 0xFFFFFFFF;
    f.d3 = __sub_cc (k.d0, 1);		// k * 2^108 - propagated carry
    f.d4 = __subc_cc(k.d1, 0);
    f.d5 = __subc   (k.d2, 0);

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d5);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d4);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp352.d10 = 0xFFFFFFFF;				// tmp352 is nearly 2^352 (all 352 bits set, i.e. 2^352 - 1)
    tmp352.d9 = 0xFFFFFFFF; tmp352.d8 = 0xFFFFFFFF; tmp352.d7 = 0xFFFFFFFF; tmp352.d6 = 0xFFFFFFFF; tmp352.d5 = 0xFFFFFFFF;
    tmp352.d4 = 0xFFFFFFFF; tmp352.d3 = 0xFFFFFFFF; tmp352.d2 = 0xFFFFFFFF; tmp352.d1 = 0xFFFFFFFF; tmp352.d0 = 0xFFFFFFFF;

    // Every tmp352 element is the constant 0xFFFFFFFF here (none are zero), so a div_352_192
    // specialised for this dividend could possibly be written.
    div_352_192(&u,tmp352,f,ff);			// u = floor(2^352 / f).  This requires f >= 161 bits.
							// (The dividend is 2^352 - 1; f is odd and > 1 so it cannot divide 2^352,
							// and the floor is therefore the same.)

							// b_preinit = 2^256
							// Let a = b_preinit / 2^160 = 2^96
							// tmp192 = a * u / 2^192
							//        = (b_preinit / 2^160) * (2^352 / f) / 2^192   (ignore the floor functions for now)
    tmp192.d0 = u.d3;					// which if we do the math simplifies to the quotient: b_preinit / f
    tmp192.d1 = u.d4;
    tmp192.d2 = u.d5;
							// NOTE(review): tmp192.d3..d5 are not set here; presumably
							// mulsub_192_M107_initial_special reads only d0..d2 -- confirm.

    k.d0 = __sub_cc (0, k.d0);				// Negate k so that the mulsub macros can use multiply-accumulate instructions
    k.d1 = __subc_cc(0, k.d1);
    k.d2 = __subc   (0, k.d2);

    mulsub_192_M107_initial_special(&a, tmp192, k, k2m1); // Compute the remainder: b_preinit - quotient * f, we only compute the low 192-bits here

    // Starting from 2^256 = 2^(2^8), 99 modular squarings give 2^(2^107) mod f.
    for (shifter = 0; shifter < 99; shifter++)
    {
							// On input a is at most 175.700 bits (see end of this loop)

      square_192_352(&b, a);				// b = a^2, b is at most 351.400 bits

      a.d0 = b.d5;					// a = b / 2^160, a is at most 191.400 bits
      a.d1 = b.d6;
      a.d2 = b.d7;
      a.d3 = b.d8;
      a.d4 = b.d9;
      a.d5 = b.d10;

      mul_192_384_no_low6(&tmp192, a, u);		// tmp192 = (b / 2^160) * (2^352 / f) / 2^192    (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 12.  A full mul_192_384 would add 11 partial results
							// into tmp384.d5 which could have generated 10 carries into tmp384.d6.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d5 could have been added into
							// tmp384.d5 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d5 could have been added into tmp384.d5 possibly generating a carry.
							// A grand total of up to 12 carries lost.

      mulsub_192_M107_special(&a, b, tmp192, k, k2m1);	// Compute the remainder: b - quotient * f, we only compute the low 192-bits here

							// Since the quotient was up to 12 too small, the remainder has a maximum value of 13*f,
							// or 172 bits + log2 (13) bits, which is 175.700 bits.
    }

    mod_simple_192(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 13 times f.

#if 0
    if(cmp_ge_192(finalrem,f))
    {
      printf("EEEEEK, final finalrem is >= f\n");
    }
if (blockIdx.x == 12 && threadIdx.x == 4){
mul_192_384_no_low6(&tmp192,u,f);
printf ("    f: %08X%08X%08X%08X%08X%08X\r\n", f.d5, f.d4, f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X %X\r\n", u.d5, u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X %X\r\n", tmp192.d5, tmp192.d4, tmp192.d3, tmp192.d2, tmp192.d1, tmp192.d0);
printf ("  rem: %08X%08X%08X%08X%08X%08X\r\n", finalrem.d5, finalrem.d4, finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */
/* f divides 2^M107 - 1 exactly when 2^(2^107) mod f == 2, hence the comparison with 2 */

    if(finalrem.d0 == 2 && ((finalrem.d5 | finalrem.d4 | finalrem.d3 | finalrem.d2 | finalrem.d1) == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=6;					/* presumably the number of 32-bit words per stored factor -- confirm against host code */
      if(index<10)				/* limit to 10 factors per class */
      {
        RES[index*6 + 2]=f.d0;
        RES[index*6 + 3]=f.d1;
        RES[index*6 + 4]=0xFFFFFFFF;		/* f.d2 is always 0xFFFFFFFF */
        RES[index*6 + 5]=f.d3;
        RES[index*6 + 6]=f.d4;
        RES[index*6 + 7]=f.d5;
      }
    }
  }

/* finally write occasional result for validation by C code */
/* If total_bit_count is 0, thread 0 never entered the loop above: the flag word below is then 0
   and the f / finalrem words that follow were never assigned. */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 6);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=f.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=f.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=0xFFFFFFFF;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=f.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=f.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=f.d5;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+9]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+10]=finalrem.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+11]=finalrem.d4;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+12]=finalrem.d5;
  }
}
