/*
This file is part of mfaktc.
Copyright (C) 2012  George Woltman (woltman@alum.mit.edu)

mfaktc is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

mfaktc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with mfaktc.  If not, see <http://www.gnu.org/licenses/>.
*/

/* 256 - 480 - 512-bit extensions are built from tf_224.h routines; S.Batalov 2012, 2014 */

__device__ static int cmp_ge_256(int256 a, int256 b)
/* returns 1 if a is greater than or equal to b, 0 otherwise.
   The words are compared from the most significant one (d7) down to the
   least significant one (d0); the first pair that differs decides. */
{
  if(a.d7 > b.d7) return(1);
  if(a.d7 < b.d7) return(0);

  if(a.d6 > b.d6) return(1);
  if(a.d6 < b.d6) return(0);

  if(a.d5 > b.d5) return(1);
  if(a.d5 < b.d5) return(0);

  if(a.d4 > b.d4) return(1);
  if(a.d4 < b.d4) return(0);

  if(a.d3 > b.d3) return(1);
  if(a.d3 < b.d3) return(0);

  if(a.d2 > b.d2) return(1);
  if(a.d2 < b.d2) return(0);

  if(a.d1 > b.d1) return(1);
  if(a.d1 < b.d1) return(0);

  /* d7..d1 are all equal: the lowest word decides, equality counts as "ge" */
  return(a.d0 >= b.d0);
}


__device__ static void sub_256(int256 *res, int256 a, int256 b)
/* res = a - b
The caller has to guarantee a >= b!

The __sub_cc / __subc_cc / __subc helpers are defined outside this file
section; going by their names they presumably start, continue and finish a
borrow chain (TODO confirm against their definition). */
{
  int256 diff;

  /* do not reorder: every step is expected to pick up the borrow left behind
     by the step directly before it */
  diff.d0 = __sub_cc (a.d0, b.d0);
  diff.d1 = __subc_cc(a.d1, b.d1);
  diff.d2 = __subc_cc(a.d2, b.d2);
  diff.d3 = __subc_cc(a.d3, b.d3);
  diff.d4 = __subc_cc(a.d4, b.d4);
  diff.d5 = __subc_cc(a.d5, b.d5);
  diff.d6 = __subc_cc(a.d6, b.d6);
  diff.d7 = __subc   (a.d7, b.d7);

  /* hand the eight result words to the caller */
  res->d0 = diff.d0;
  res->d1 = diff.d1;
  res->d2 = diff.d2;
  res->d3 = diff.d3;
  res->d4 = diff.d4;
  res->d5 = diff.d5;
  res->d6 = diff.d6;
  res->d7 = diff.d7;
}


__device__ static void square_256_512(int512 *res, int256 a)
/*
res = a^2; all 16 words res->d0 .. res->d15 are written.

Precondition: a < 2^255 (a.d7 < 2^31)!
The doubling step ends with a plain addc (no carry out) on word 14, so the
doubled sum of the cross products has to fit into words 1..14; presumably
this is the reason for the bound.

Operand map of the asm block:
  %0  .. %15 = res->d0 .. res->d15
  %16 .. %23 = a.d0    .. a.d7

Method:
  1. add up every cross product a.di * a.dj with i < j exactly once
     (low half into word i+j, high half into word i+j+1)
  2. double that sum
  3. add the diagonal products a.di * a.di
     (low half into word 2i, high half into word 2i+1)

In step 1 every group after the first one is a single carry chain; its last
instruction opens the next higher word (addend 0) and takes the final carry.
To keep the chains unbroken the products are NOT listed in index order - the
comment at the end of each line names the product that line adds.
*/
{
  asm("{\n\t"
      /* step 1: cross products, first group initialises words 1..7 */
      "mul.lo.u32      %1, %16, %17;\n\t"     /* (a.d0 * a.d1).lo */
      "mul.lo.u32      %2, %16, %18;\n\t"     /* (a.d0 * a.d2).lo */
      "mul.lo.u32      %3, %16, %19;\n\t"     /* (a.d0 * a.d3).lo */
      "mul.lo.u32      %4, %16, %20;\n\t"     /* (a.d0 * a.d4).lo */
      "mul.lo.u32      %5, %16, %21;\n\t"     /* (a.d0 * a.d5).lo */
      "mul.lo.u32      %6, %16, %22;\n\t"     /* (a.d0 * a.d6).lo */
      "mul.lo.u32      %7, %16, %23;\n\t"     /* (a.d0 * a.d7).lo */

      "mad.hi.cc.u32   %2, %16, %17, %2;\n\t" /* (a.d0 * a.d1).hi */
      "madc.hi.cc.u32  %3, %16, %18, %3;\n\t" /* (a.d0 * a.d2).hi */
      "madc.hi.cc.u32  %4, %16, %19, %4;\n\t" /* (a.d0 * a.d3).hi */
      "madc.hi.cc.u32  %5, %16, %20, %5;\n\t" /* (a.d0 * a.d4).hi */
      "madc.hi.cc.u32  %6, %16, %21, %6;\n\t" /* (a.d0 * a.d5).hi */
      "madc.hi.cc.u32  %7, %16, %22, %7;\n\t" /* (a.d0 * a.d6).hi */
      "madc.hi.u32     %8, %16, %23, 0;\n\t"  /* (a.d0 * a.d7).hi */

      "mad.lo.cc.u32   %3, %17, %18, %3;\n\t" /* (a.d1 * a.d2).lo */
      "madc.hi.cc.u32  %4, %17, %18, %4;\n\t" /* (a.d1 * a.d2).hi */
      "madc.hi.cc.u32  %5, %17, %19, %5;\n\t" /* (a.d1 * a.d3).hi */
      "madc.hi.cc.u32  %6, %17, %20, %6;\n\t" /* (a.d1 * a.d4).hi */
      "madc.hi.cc.u32  %7, %17, %21, %7;\n\t" /* (a.d1 * a.d5).hi */
      "madc.hi.cc.u32  %8, %17, %22, %8;\n\t" /* (a.d1 * a.d6).hi */
      "madc.hi.u32     %9, %17, %23, 0;\n\t"  /* (a.d1 * a.d7).hi */

      "mad.lo.cc.u32   %4, %17, %19, %4;\n\t" /* (a.d1 * a.d3).lo */
      "madc.lo.cc.u32  %5, %17, %20, %5;\n\t" /* (a.d1 * a.d4).lo */
      "madc.lo.cc.u32  %6, %17, %21, %6;\n\t" /* (a.d1 * a.d5).lo */
      "madc.lo.cc.u32  %7, %17, %22, %7;\n\t" /* (a.d1 * a.d6).lo */
      "madc.lo.cc.u32  %8, %17, %23, %8;\n\t" /* (a.d1 * a.d7).lo */
      "madc.hi.cc.u32  %9, %18, %22, %9;\n\t" /* (a.d2 * a.d6).hi */
      "madc.hi.u32     %10, %18, %23, 0;\n\t" /* (a.d2 * a.d7).hi */

      "mad.lo.cc.u32   %5, %18, %19, %5;\n\t" /* (a.d2 * a.d3).lo */
      "madc.lo.cc.u32  %6, %18, %20, %6;\n\t" /* (a.d2 * a.d4).lo */
      "madc.lo.cc.u32  %7, %18, %21, %7;\n\t" /* (a.d2 * a.d5).lo */
      "madc.lo.cc.u32  %8, %18, %22, %8;\n\t" /* (a.d2 * a.d6).lo */
      "madc.lo.cc.u32  %9, %18, %23, %9;\n\t" /* (a.d2 * a.d7).lo */
      "madc.hi.cc.u32  %10,%19, %22, %10;\n\t"/* (a.d3 * a.d6).hi */
      "madc.hi.u32     %11,%19, %23, 0;\n\t"  /* (a.d3 * a.d7).hi */

      "mad.hi.cc.u32   %6, %18, %19, %6;\n\t"  /* (a.d2 * a.d3).hi */
      "madc.hi.cc.u32  %7, %18, %20, %7;\n\t"  /* (a.d2 * a.d4).hi */
      "madc.hi.cc.u32  %8, %18, %21, %8;\n\t"  /* (a.d2 * a.d5).hi */
      "madc.lo.cc.u32  %9, %20, %21, %9;\n\t"  /* (a.d4 * a.d5).lo */
      "madc.lo.cc.u32  %10,%20, %22, %10;\n\t" /* (a.d4 * a.d6).lo */
      "madc.lo.cc.u32  %11,%20, %23, %11;\n\t" /* (a.d4 * a.d7).lo */
      "madc.hi.u32     %12,%20, %23, 0;\n\t"   /* (a.d4 * a.d7).hi */

      "mad.lo.cc.u32   %7, %19, %20, %7;\n\t"  /* (a.d3 * a.d4).lo */
      "madc.lo.cc.u32  %8, %19, %21, %8;\n\t"  /* (a.d3 * a.d5).lo */
      "madc.lo.cc.u32  %9, %19, %22, %9;\n\t"  /* (a.d3 * a.d6).lo */
      "madc.lo.cc.u32  %10,%19, %23, %10;\n\t" /* (a.d3 * a.d7).lo */
      "madc.lo.cc.u32  %11,%21, %22, %11;\n\t" /* (a.d5 * a.d6).lo */
      "madc.lo.cc.u32  %12,%21, %23, %12;\n\t" /* (a.d5 * a.d7).lo */
      "madc.hi.u32     %13,%21, %23, 0;\n\t"   /* (a.d5 * a.d7).hi */

      "mad.hi.cc.u32   %8, %19, %20, %8;\n\t"  /* (a.d3 * a.d4).hi */
      "madc.hi.cc.u32  %9, %19, %21, %9;\n\t"  /* (a.d3 * a.d5).hi */
      "madc.hi.cc.u32  %10,%20, %21, %10;\n\t" /* (a.d4 * a.d5).hi */
      "madc.hi.cc.u32  %11,%20, %22, %11;\n\t" /* (a.d4 * a.d6).hi */
      "madc.hi.cc.u32  %12,%21, %22, %12;\n\t" /* (a.d5 * a.d6).hi */
      "madc.lo.cc.u32  %13,%22, %23, %13;\n\t" /* (a.d6 * a.d7).lo */
      "madc.hi.u32     %14,%22, %23, 0;\n\t"   /* (a.d6 * a.d7).hi */

      /* step 2: words 1..14 = 2 * (sum of cross products) */
      "add.cc.u32      %1, %1, %1;\n\t"       /* Double the partial results */
      "addc.cc.u32     %2, %2, %2;\n\t"
      "addc.cc.u32     %3, %3, %3;\n\t"
      "addc.cc.u32     %4, %4, %4;\n\t"
      "addc.cc.u32     %5, %5, %5;\n\t"
      "addc.cc.u32     %6, %6, %6;\n\t"
      "addc.cc.u32     %7, %7, %7;\n\t"
      "addc.cc.u32     %8, %8, %8;\n\t"
      "addc.cc.u32     %9, %9, %9;\n\t"
      "addc.cc.u32     %10, %10, %10;\n\t"
      "addc.cc.u32     %11, %11, %11;\n\t"
      "addc.cc.u32     %12, %12, %12;\n\t"
      "addc.cc.u32     %13, %13, %13;\n\t"
      "addc.u32        %14, %14, %14;\n\t"    /* no .cc: a carry out of word 14 would be lost */

      /* step 3: add the diagonal; word 0 and word 15 are written here for the first time */
      "mul.lo.u32      %0, %16, %16;\n\t"       /* (a.d0 * a.d0).lo */
      "mad.hi.cc.u32   %1, %16, %16, %1;\n\t"   /* (a.d0 * a.d0).hi */
      "madc.lo.cc.u32  %2, %17, %17, %2;\n\t"   /* (a.d1 * a.d1).lo */
      "madc.hi.cc.u32  %3, %17, %17, %3;\n\t"   /* (a.d1 * a.d1).hi */
      "madc.lo.cc.u32  %4, %18, %18, %4;\n\t"   /* (a.d2 * a.d2).lo */
      "madc.hi.cc.u32  %5, %18, %18, %5;\n\t"   /* (a.d2 * a.d2).hi */
      "madc.lo.cc.u32  %6, %19, %19, %6;\n\t"   /* (a.d3 * a.d3).lo */
      "madc.hi.cc.u32  %7, %19, %19, %7;\n\t"   /* (a.d3 * a.d3).hi */
      "madc.lo.cc.u32  %8, %20, %20, %8;\n\t"   /* (a.d4 * a.d4).lo */
      "madc.hi.cc.u32  %9, %20, %20, %9;\n\t"   /* (a.d4 * a.d4).hi */
      "madc.lo.cc.u32  %10, %21, %21, %10;\n\t" /* (a.d5 * a.d5).lo */
      "madc.hi.cc.u32  %11, %21, %21, %11;\n\t" /* (a.d5 * a.d5).hi */
      "madc.lo.cc.u32  %12, %22, %22, %12;\n\t" /* (a.d6 * a.d6).lo */
      "madc.hi.cc.u32  %13, %22, %22, %13;\n\t" /* (a.d6 * a.d6).hi */
      "madc.lo.cc.u32  %14, %23, %23, %14;\n\t" /* (a.d7 * a.d7).lo */
      "madc.hi.u32     %15, %23, %23, 0;\n\t"   /* (a.d7 * a.d7).hi */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5),
        "=r" (res->d6), "=r" (res->d7), "=r" (res->d8), "=r" (res->d9), "=r" (res->d10), "=r" (res->d11), 
        "=r" (res->d12), "=r" (res->d13), "=r" (res->d14), "=r" (res->d15)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5), "r" (a.d6), "r" (a.d7));
}


__device__ static void square_256_480(int480 *res, int256 a)
/*
res = a^2; the 15 words res->d0 .. res->d14 are written.

Precondition: a.d7 < 2^16, i.e. a < 2^240!
Only the low half of a.d7 * a.d7 is added (into word 14) and there is no
word 15, so the square has to fit into 480 bits.

Operand map of the asm block:
  %0  .. %14 = res->d0 .. res->d14
  %15 .. %22 = a.d0    .. a.d7

Method (same scheme as the 512-bit square, one output word less):
  1. add up every cross product a.di * a.dj with i < j exactly once
     (low half into word i+j, high half into word i+j+1)
  2. double that sum
  3. add the diagonal products a.di * a.di
     (low half into word 2i, high half into word 2i+1)

In step 1 every group after the first one is a single carry chain; its last
instruction opens the next higher word (addend 0) and takes the final carry.
To keep the chains unbroken the products are NOT listed in index order - the
comment at the end of each line names the product that line adds.
*/
{
  asm("{\n\t"
      /* step 1: cross products, first group initialises words 1..7 */
      "mul.lo.u32      %1, %15, %16;\n\t"     /* (a.d0 * a.d1).lo */
      "mul.lo.u32      %2, %15, %17;\n\t"     /* (a.d0 * a.d2).lo */
      "mul.lo.u32      %3, %15, %18;\n\t"     /* (a.d0 * a.d3).lo */
      "mul.lo.u32      %4, %15, %19;\n\t"     /* (a.d0 * a.d4).lo */
      "mul.lo.u32      %5, %15, %20;\n\t"     /* (a.d0 * a.d5).lo */
      "mul.lo.u32      %6, %15, %21;\n\t"     /* (a.d0 * a.d6).lo */
      "mul.lo.u32      %7, %15, %22;\n\t"     /* (a.d0 * a.d7).lo */

      "mad.hi.cc.u32   %2, %15, %16, %2;\n\t" /* (a.d0 * a.d1).hi */
      "madc.hi.cc.u32  %3, %15, %17, %3;\n\t" /* (a.d0 * a.d2).hi */
      "madc.hi.cc.u32  %4, %15, %18, %4;\n\t" /* (a.d0 * a.d3).hi */
      "madc.hi.cc.u32  %5, %15, %19, %5;\n\t" /* (a.d0 * a.d4).hi */
      "madc.hi.cc.u32  %6, %15, %20, %6;\n\t" /* (a.d0 * a.d5).hi */
      "madc.hi.cc.u32  %7, %15, %21, %7;\n\t" /* (a.d0 * a.d6).hi */
      "madc.hi.u32     %8, %15, %22, 0;\n\t"  /* (a.d0 * a.d7).hi */

      "mad.lo.cc.u32   %3, %16, %17, %3;\n\t" /* (a.d1 * a.d2).lo */
      "madc.hi.cc.u32  %4, %16, %17, %4;\n\t" /* (a.d1 * a.d2).hi */
      "madc.hi.cc.u32  %5, %16, %18, %5;\n\t" /* (a.d1 * a.d3).hi */
      "madc.hi.cc.u32  %6, %16, %19, %6;\n\t" /* (a.d1 * a.d4).hi */
      "madc.hi.cc.u32  %7, %16, %20, %7;\n\t" /* (a.d1 * a.d5).hi */
      "madc.hi.cc.u32  %8, %16, %21, %8;\n\t" /* (a.d1 * a.d6).hi */
      "madc.hi.u32     %9, %16, %22, 0;\n\t"  /* (a.d1 * a.d7).hi */

      "mad.lo.cc.u32   %4, %16, %18, %4;\n\t" /* (a.d1 * a.d3).lo */
      "madc.lo.cc.u32  %5, %16, %19, %5;\n\t" /* (a.d1 * a.d4).lo */
      "madc.lo.cc.u32  %6, %16, %20, %6;\n\t" /* (a.d1 * a.d5).lo */
      "madc.lo.cc.u32  %7, %16, %21, %7;\n\t" /* (a.d1 * a.d6).lo */
      "madc.lo.cc.u32  %8, %16, %22, %8;\n\t" /* (a.d1 * a.d7).lo */
      "madc.hi.cc.u32  %9, %17, %21, %9;\n\t" /* (a.d2 * a.d6).hi */
      "madc.hi.u32     %10, %17, %22, 0;\n\t" /* (a.d2 * a.d7).hi */

      "mad.lo.cc.u32   %5, %17, %18, %5;\n\t" /* (a.d2 * a.d3).lo */
      "madc.lo.cc.u32  %6, %17, %19, %6;\n\t" /* (a.d2 * a.d4).lo */
      "madc.lo.cc.u32  %7, %17, %20, %7;\n\t" /* (a.d2 * a.d5).lo */
      "madc.lo.cc.u32  %8, %17, %21, %8;\n\t" /* (a.d2 * a.d6).lo */
      "madc.lo.cc.u32  %9, %17, %22, %9;\n\t" /* (a.d2 * a.d7).lo */
      "madc.hi.cc.u32  %10,%18, %21, %10;\n\t"/* (a.d3 * a.d6).hi */
      "madc.hi.u32     %11,%18, %22, 0;\n\t"  /* (a.d3 * a.d7).hi */

      "mad.hi.cc.u32   %6, %17, %18, %6;\n\t"  /* (a.d2 * a.d3).hi */
      "madc.hi.cc.u32  %7, %17, %19, %7;\n\t"  /* (a.d2 * a.d4).hi */
      "madc.hi.cc.u32  %8, %17, %20, %8;\n\t"  /* (a.d2 * a.d5).hi */
      "madc.lo.cc.u32  %9, %19, %20, %9;\n\t"  /* (a.d4 * a.d5).lo */
      "madc.lo.cc.u32  %10,%19, %21, %10;\n\t" /* (a.d4 * a.d6).lo */
      "madc.lo.cc.u32  %11,%19, %22, %11;\n\t" /* (a.d4 * a.d7).lo */
      "madc.hi.u32     %12,%19, %22, 0;\n\t"   /* (a.d4 * a.d7).hi */

      "mad.lo.cc.u32   %7, %18, %19, %7;\n\t"  /* (a.d3 * a.d4).lo */
      "madc.lo.cc.u32  %8, %18, %20, %8;\n\t"  /* (a.d3 * a.d5).lo */
      "madc.lo.cc.u32  %9, %18, %21, %9;\n\t"  /* (a.d3 * a.d6).lo */
      "madc.lo.cc.u32  %10,%18, %22, %10;\n\t" /* (a.d3 * a.d7).lo */
      "madc.lo.cc.u32  %11,%20, %21, %11;\n\t" /* (a.d5 * a.d6).lo */
      "madc.lo.cc.u32  %12,%20, %22, %12;\n\t" /* (a.d5 * a.d7).lo */
      "madc.hi.u32     %13,%20, %22, 0;\n\t"   /* (a.d5 * a.d7).hi */

      "mad.hi.cc.u32   %8, %18, %19, %8;\n\t"  /* (a.d3 * a.d4).hi */
      "madc.hi.cc.u32  %9, %18, %20, %9;\n\t"  /* (a.d3 * a.d5).hi */
      "madc.hi.cc.u32  %10,%19, %20, %10;\n\t" /* (a.d4 * a.d5).hi */
      "madc.hi.cc.u32  %11,%19, %21, %11;\n\t" /* (a.d4 * a.d6).hi */
      "madc.hi.cc.u32  %12,%20, %21, %12;\n\t" /* (a.d5 * a.d6).hi */
      "madc.lo.cc.u32  %13,%21, %22, %13;\n\t" /* (a.d6 * a.d7).lo */
      "madc.hi.u32     %14,%21, %22, 0;\n\t"   /* (a.d6 * a.d7).hi */

      /* step 2: words 1..14 = 2 * (sum of cross products) */
      "add.cc.u32      %1, %1, %1;\n\t"       /* Double the partial results */
      "addc.cc.u32     %2, %2, %2;\n\t"
      "addc.cc.u32     %3, %3, %3;\n\t"
      "addc.cc.u32     %4, %4, %4;\n\t"
      "addc.cc.u32     %5, %5, %5;\n\t"
      "addc.cc.u32     %6, %6, %6;\n\t"
      "addc.cc.u32     %7, %7, %7;\n\t"
      "addc.cc.u32     %8, %8, %8;\n\t"
      "addc.cc.u32     %9, %9, %9;\n\t"
      "addc.cc.u32     %10, %10, %10;\n\t"
      "addc.cc.u32     %11, %11, %11;\n\t"
      "addc.cc.u32     %12, %12, %12;\n\t"
      "addc.cc.u32     %13, %13, %13;\n\t"
      "addc.u32        %14, %14, %14;\n\t"    /* no .cc: a carry out of word 14 would be lost */

      /* step 3: add the diagonal; word 0 is written here for the first time */
      "mul.lo.u32      %0, %15, %15;\n\t"       /* (a.d0 * a.d0).lo */
      "mad.hi.cc.u32   %1, %15, %15, %1;\n\t"   /* (a.d0 * a.d0).hi */
      "madc.lo.cc.u32  %2, %16, %16, %2;\n\t"   /* (a.d1 * a.d1).lo */
      "madc.hi.cc.u32  %3, %16, %16, %3;\n\t"   /* (a.d1 * a.d1).hi */
      "madc.lo.cc.u32  %4, %17, %17, %4;\n\t"   /* (a.d2 * a.d2).lo */
      "madc.hi.cc.u32  %5, %17, %17, %5;\n\t"   /* (a.d2 * a.d2).hi */
      "madc.lo.cc.u32  %6, %18, %18, %6;\n\t"   /* (a.d3 * a.d3).lo */
      "madc.hi.cc.u32  %7, %18, %18, %7;\n\t"   /* (a.d3 * a.d3).hi */
      "madc.lo.cc.u32  %8, %19, %19, %8;\n\t"   /* (a.d4 * a.d4).lo */
      "madc.hi.cc.u32  %9, %19, %19, %9;\n\t"   /* (a.d4 * a.d4).hi */
      "madc.lo.cc.u32  %10, %20, %20, %10;\n\t" /* (a.d5 * a.d5).lo */
      "madc.hi.cc.u32  %11, %20, %20, %11;\n\t" /* (a.d5 * a.d5).hi */
      "madc.lo.cc.u32  %12, %21, %21, %12;\n\t" /* (a.d6 * a.d6).lo */
      "madc.hi.cc.u32  %13, %21, %21, %13;\n\t" /* (a.d6 * a.d6).hi */
      "madc.lo.u32     %14, %22, %22, %14;\n\t" /* (a.d7 * a.d7).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5),
        "=r" (res->d6), "=r" (res->d7), "=r" (res->d8), "=r" (res->d9), "=r" (res->d10), "=r" (res->d11), 
        "=r" (res->d12), "=r" (res->d13), "=r" (res->d14)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5), "r" (a.d6), "r" (a.d7));
}


__device__ static void mul_256_512_no_low8(int256 *res, int256 a, int256 b)
/*
res ~= a * b / 2^256
i.e. approximately the upper 8 words of the 512-bit product; res->dk takes the
place of word k+8 of the full product.

Operand map of the asm block:
  %0  .. %7  = res->d0 .. res->d7
  %8  .. %15 = a.d0    .. a.d7
  %16 .. %23 = b.d0    .. b.d7

Only the partial-product halves that land in word 8 or higher are added:
low halves of a.di * b.dj with i+j >= 8 and high halves with i+j >= 7.
Everything that would land in words 0..7 is skipped, so carries into res.d0
are NOT computed and the result can only be lower than (never higher than) a
full 256x256 -> 512 bit multiplication divided by 2^256.

NOTE(review): the previous version of this comment said "ten possible carries
from res.d5 to res.d6 ... 0 to 10 lower", which looks like it was carried over
from a narrower variant. Counting the terms skipped here, 15 halves land in
word 7 (8 low halves with i+j = 7 and 7 high halves with i+j = 6), which would
allow up to 14 missing carries into res.d0 - confirm before relying on an
exact error bound.

The rows come in lo/hi pairs. Each "lo" row ends with "addc %k, 0, 0", which
opens the next result word with just the carry; the following "hi" row then
adds the top high half into that word.
*/
{
  asm("{\n\t"
      "mul.hi.u32      %0, %8, %23;\n\t"     /* (a.d0 * b.d7).hi */

      "mad.lo.cc.u32   %0, %9, %23, %0;\n\t" /* (a.d1 * b.d7).lo */
      "addc.u32        %1, 0, 0;\n\t"

      "mad.hi.cc.u32   %0, %9, %22, %0;\n\t" /* (a.d1 * b.d6).hi */
      "madc.hi.u32     %1, %9, %23, %1;\n\t" /* (a.d1 * b.d7).hi */

      "mad.lo.cc.u32   %0, %10, %22, %0;\n\t" /* (a.d2 * b.d6).lo */
      "madc.lo.cc.u32  %1, %10, %23, %1;\n\t" /* (a.d2 * b.d7).lo */
      "addc.u32        %2, 0, 0;\n\t"

      "mad.hi.cc.u32   %0, %10, %21, %0;\n\t" /* (a.d2 * b.d5).hi */
      "madc.hi.cc.u32  %1, %10, %22, %1;\n\t" /* (a.d2 * b.d6).hi */
      "madc.hi.u32     %2, %10, %23, %2;\n\t" /* (a.d2 * b.d7).hi */

      "mad.lo.cc.u32   %0, %11, %21, %0;\n\t" /* (a.d3 * b.d5).lo */
      "madc.lo.cc.u32  %1, %11, %22, %1;\n\t" /* (a.d3 * b.d6).lo */
      "madc.lo.cc.u32  %2, %11, %23, %2;\n\t" /* (a.d3 * b.d7).lo */
      "addc.u32        %3, 0, 0;\n\t"

      "mad.hi.cc.u32   %0, %11, %20, %0;\n\t" /* (a.d3 * b.d4).hi */
      "madc.hi.cc.u32  %1, %11, %21, %1;\n\t" /* (a.d3 * b.d5).hi */
      "madc.hi.cc.u32  %2, %11, %22, %2;\n\t" /* (a.d3 * b.d6).hi */
      "madc.hi.u32     %3, %11, %23, %3;\n\t" /* (a.d3 * b.d7).hi */

      "mad.lo.cc.u32   %0, %12, %20, %0;\n\t" /* (a.d4 * b.d4).lo */
      "madc.lo.cc.u32  %1, %12, %21, %1;\n\t" /* (a.d4 * b.d5).lo */
      "madc.lo.cc.u32  %2, %12, %22, %2;\n\t" /* (a.d4 * b.d6).lo */
      "madc.lo.cc.u32  %3, %12, %23, %3;\n\t" /* (a.d4 * b.d7).lo */
      "addc.u32        %4, 0, 0;\n\t"

      "mad.hi.cc.u32   %0, %12, %19, %0;\n\t" /* (a.d4 * b.d3).hi */
      "madc.hi.cc.u32  %1, %12, %20, %1;\n\t" /* (a.d4 * b.d4).hi */
      "madc.hi.cc.u32  %2, %12, %21, %2;\n\t" /* (a.d4 * b.d5).hi */
      "madc.hi.cc.u32  %3, %12, %22, %3;\n\t" /* (a.d4 * b.d6).hi */
      "madc.hi.u32     %4, %12, %23, %4;\n\t" /* (a.d4 * b.d7).hi */

      "mad.lo.cc.u32   %0, %13, %19, %0;\n\t" /* (a.d5 * b.d3).lo */
      "madc.lo.cc.u32  %1, %13, %20, %1;\n\t" /* (a.d5 * b.d4).lo */
      "madc.lo.cc.u32  %2, %13, %21, %2;\n\t" /* (a.d5 * b.d5).lo */
      "madc.lo.cc.u32  %3, %13, %22, %3;\n\t" /* (a.d5 * b.d6).lo */
      "madc.lo.cc.u32  %4, %13, %23, %4;\n\t" /* (a.d5 * b.d7).lo */
      "addc.u32        %5, 0, 0;\n\t"

      "mad.hi.cc.u32   %0, %13, %18, %0;\n\t" /* (a.d5 * b.d2).hi */
      "madc.hi.cc.u32  %1, %13, %19, %1;\n\t" /* (a.d5 * b.d3).hi */
      "madc.hi.cc.u32  %2, %13, %20, %2;\n\t" /* (a.d5 * b.d4).hi */
      "madc.hi.cc.u32  %3, %13, %21, %3;\n\t" /* (a.d5 * b.d5).hi */
      "madc.hi.cc.u32  %4, %13, %22, %4;\n\t" /* (a.d5 * b.d6).hi */
      "madc.hi.u32     %5, %13, %23, %5;\n\t" /* (a.d5 * b.d7).hi */

      "mad.lo.cc.u32   %0, %14, %18, %0;\n\t" /* (a.d6 * b.d2).lo */
      "madc.lo.cc.u32  %1, %14, %19, %1;\n\t" /* (a.d6 * b.d3).lo */
      "madc.lo.cc.u32  %2, %14, %20, %2;\n\t" /* (a.d6 * b.d4).lo */
      "madc.lo.cc.u32  %3, %14, %21, %3;\n\t" /* (a.d6 * b.d5).lo */
      "madc.lo.cc.u32  %4, %14, %22, %4;\n\t" /* (a.d6 * b.d6).lo */
      "madc.lo.cc.u32  %5, %14, %23, %5;\n\t" /* (a.d6 * b.d7).lo */
      "addc.u32        %6, 0, 0;\n\t"

      "mad.hi.cc.u32   %0, %14, %17, %0;\n\t" /* (a.d6 * b.d1).hi */
      "madc.hi.cc.u32  %1, %14, %18, %1;\n\t" /* (a.d6 * b.d2).hi */
      "madc.hi.cc.u32  %2, %14, %19, %2;\n\t" /* (a.d6 * b.d3).hi */
      "madc.hi.cc.u32  %3, %14, %20, %3;\n\t" /* (a.d6 * b.d4).hi */
      "madc.hi.cc.u32  %4, %14, %21, %4;\n\t" /* (a.d6 * b.d5).hi */
      "madc.hi.cc.u32  %5, %14, %22, %5;\n\t" /* (a.d6 * b.d6).hi */
      "madc.hi.u32     %6, %14, %23, %6;\n\t" /* (a.d6 * b.d7).hi */

      "mad.lo.cc.u32   %0, %15, %17, %0;\n\t" /* (a.d7 * b.d1).lo */
      "madc.lo.cc.u32  %1, %15, %18, %1;\n\t" /* (a.d7 * b.d2).lo */
      "madc.lo.cc.u32  %2, %15, %19, %2;\n\t" /* (a.d7 * b.d3).lo */
      "madc.lo.cc.u32  %3, %15, %20, %3;\n\t" /* (a.d7 * b.d4).lo */
      "madc.lo.cc.u32  %4, %15, %21, %4;\n\t" /* (a.d7 * b.d5).lo */
      "madc.lo.cc.u32  %5, %15, %22, %5;\n\t" /* (a.d7 * b.d6).lo */
      "madc.lo.cc.u32  %6, %15, %23, %6;\n\t" /* (a.d7 * b.d7).lo */
      "addc.u32        %7, 0, 0;\n\t"

      "mad.hi.cc.u32   %0, %15, %16, %0;\n\t" /* (a.d7 * b.d0).hi */
      "madc.hi.cc.u32  %1, %15, %17, %1;\n\t" /* (a.d7 * b.d1).hi */
      "madc.hi.cc.u32  %2, %15, %18, %2;\n\t" /* (a.d7 * b.d2).hi */
      "madc.hi.cc.u32  %3, %15, %19, %3;\n\t" /* (a.d7 * b.d3).hi */
      "madc.hi.cc.u32  %4, %15, %20, %4;\n\t" /* (a.d7 * b.d4).hi */
      "madc.hi.cc.u32  %5, %15, %21, %5;\n\t" /* (a.d7 * b.d5).hi */
      "madc.hi.cc.u32  %6, %15, %22, %6;\n\t" /* (a.d7 * b.d6).hi */
      "madc.hi.u32     %7, %15, %23, %7;\n\t" /* (a.d7 * b.d7).hi */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5), "=r" (res->d6), "=r" (res->d7)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5), "r" (a.d6), "r" (a.d7),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3), "r" (b.d4), "r" (b.d5), "r" (b.d6), "r" (b.d7));
}


__device__ static void mul_256(int256 *res, int256 a, int256 b)
/*
res = a * b (only lower 256 bits of the result)

Operand map of the asm block:
  %0  .. %7  = res->d0 .. res->d7
  %8  .. %15 = a.d0    .. a.d7
  %16 .. %23 = b.d0    .. b.d7

Schoolbook multiplication cut off at word 7: the low half of a.di * b.dj is
added into word i+j and the high half into word i+j+1, and every term that
would land in word 8 or above is left out. That is why each row is one term
shorter than the row before it, and why the last instruction of each row has
no .cc (a carry out of word 7 is dropped on purpose).
The first row (a.d0 * b.dj low halves) initialises all eight result words.
*/
{
  asm("{\n\t"
      "mul.lo.u32      %0, %8, %16;\n\t"     /* (a.d0 * b.d0).lo */
      "mul.lo.u32      %1, %8, %17;\n\t"     /* (a.d0 * b.d1).lo */
      "mul.lo.u32      %2, %8, %18;\n\t"     /* (a.d0 * b.d2).lo */
      "mul.lo.u32      %3, %8, %19;\n\t"     /* (a.d0 * b.d3).lo */
      "mul.lo.u32      %4, %8, %20;\n\t"     /* (a.d0 * b.d4).lo */
      "mul.lo.u32      %5, %8, %21;\n\t"     /* (a.d0 * b.d5).lo */
      "mul.lo.u32      %6, %8, %22;\n\t"     /* (a.d0 * b.d6).lo */
      "mul.lo.u32      %7, %8, %23;\n\t"     /* (a.d0 * b.d7).lo */

      "mad.hi.cc.u32   %1, %8, %16, %1;\n\t" /* (a.d0 * b.d0).hi */
      "madc.hi.cc.u32  %2, %8, %17, %2;\n\t" /* (a.d0 * b.d1).hi */
      "madc.hi.cc.u32  %3, %8, %18, %3;\n\t" /* (a.d0 * b.d2).hi */
      "madc.hi.cc.u32  %4, %8, %19, %4;\n\t" /* (a.d0 * b.d3).hi */
      "madc.hi.cc.u32  %5, %8, %20, %5;\n\t" /* (a.d0 * b.d4).hi */
      "madc.hi.cc.u32  %6, %8, %21, %6;\n\t" /* (a.d0 * b.d5).hi */
      "madc.hi.u32     %7, %8, %22, %7;\n\t" /* (a.d0 * b.d6).hi */

      "mad.lo.cc.u32   %1, %9, %16, %1;\n\t" /* (a.d1 * b.d0).lo */
      "madc.lo.cc.u32  %2, %9, %17, %2;\n\t" /* (a.d1 * b.d1).lo */
      "madc.lo.cc.u32  %3, %9, %18, %3;\n\t" /* (a.d1 * b.d2).lo */
      "madc.lo.cc.u32  %4, %9, %19, %4;\n\t" /* (a.d1 * b.d3).lo */
      "madc.lo.cc.u32  %5, %9, %20, %5;\n\t" /* (a.d1 * b.d4).lo */
      "madc.lo.cc.u32  %6, %9, %21, %6;\n\t" /* (a.d1 * b.d5).lo */
      "madc.lo.u32     %7, %9, %22, %7;\n\t" /* (a.d1 * b.d6).lo */

      "mad.hi.cc.u32   %2, %9, %16, %2;\n\t" /* (a.d1 * b.d0).hi */
      "madc.hi.cc.u32  %3, %9, %17, %3;\n\t" /* (a.d1 * b.d1).hi */
      "madc.hi.cc.u32  %4, %9, %18, %4;\n\t" /* (a.d1 * b.d2).hi */
      "madc.hi.cc.u32  %5, %9, %19, %5;\n\t" /* (a.d1 * b.d3).hi */
      "madc.hi.cc.u32  %6, %9, %20, %6;\n\t" /* (a.d1 * b.d4).hi */
      "madc.hi.u32     %7, %9, %21, %7;\n\t" /* (a.d1 * b.d5).hi */

      "mad.lo.cc.u32   %2, %10, %16, %2;\n\t" /* (a.d2 * b.d0).lo */
      "madc.lo.cc.u32  %3, %10, %17, %3;\n\t" /* (a.d2 * b.d1).lo */
      "madc.lo.cc.u32  %4, %10, %18, %4;\n\t" /* (a.d2 * b.d2).lo */
      "madc.lo.cc.u32  %5, %10, %19, %5;\n\t" /* (a.d2 * b.d3).lo */
      "madc.lo.cc.u32  %6, %10, %20, %6;\n\t" /* (a.d2 * b.d4).lo */
      "madc.lo.u32     %7, %10, %21, %7;\n\t" /* (a.d2 * b.d5).lo */

      "mad.hi.cc.u32   %3, %10, %16, %3;\n\t" /* (a.d2 * b.d0).hi */
      "madc.hi.cc.u32  %4, %10, %17, %4;\n\t" /* (a.d2 * b.d1).hi */
      "madc.hi.cc.u32  %5, %10, %18, %5;\n\t" /* (a.d2 * b.d2).hi */
      "madc.hi.cc.u32  %6, %10, %19, %6;\n\t" /* (a.d2 * b.d3).hi */
      "madc.hi.u32     %7, %10, %20, %7;\n\t" /* (a.d2 * b.d4).hi */

      "mad.lo.cc.u32   %3, %11, %16, %3;\n\t" /* (a.d3 * b.d0).lo */
      "madc.lo.cc.u32  %4, %11, %17, %4;\n\t" /* (a.d3 * b.d1).lo */
      "madc.lo.cc.u32  %5, %11, %18, %5;\n\t" /* (a.d3 * b.d2).lo */
      "madc.lo.cc.u32  %6, %11, %19, %6;\n\t" /* (a.d3 * b.d3).lo */
      "madc.lo.u32     %7, %11, %20, %7;\n\t" /* (a.d3 * b.d4).lo */

      "mad.hi.cc.u32   %4, %11, %16, %4;\n\t" /* (a.d3 * b.d0).hi */
      "madc.hi.cc.u32  %5, %11, %17, %5;\n\t" /* (a.d3 * b.d1).hi */
      "madc.hi.cc.u32  %6, %11, %18, %6;\n\t" /* (a.d3 * b.d2).hi */
      "madc.hi.u32     %7, %11, %19, %7;\n\t" /* (a.d3 * b.d3).hi */

      "mad.lo.cc.u32   %4, %12, %16, %4;\n\t" /* (a.d4 * b.d0).lo */
      "madc.lo.cc.u32  %5, %12, %17, %5;\n\t" /* (a.d4 * b.d1).lo */
      "madc.lo.cc.u32  %6, %12, %18, %6;\n\t" /* (a.d4 * b.d2).lo */
      "madc.lo.u32     %7, %12, %19, %7;\n\t" /* (a.d4 * b.d3).lo */

      "mad.hi.cc.u32   %5, %12, %16, %5;\n\t" /* (a.d4 * b.d0).hi */
      "madc.hi.cc.u32  %6, %12, %17, %6;\n\t" /* (a.d4 * b.d1).hi */
      "madc.hi.u32     %7, %12, %18, %7;\n\t" /* (a.d4 * b.d2).hi */

      "mad.lo.cc.u32   %5, %13, %16, %5;\n\t" /* (a.d5 * b.d0).lo */
      "madc.lo.cc.u32  %6, %13, %17, %6;\n\t" /* (a.d5 * b.d1).lo */
      "madc.lo.u32     %7, %13, %18, %7;\n\t" /* (a.d5 * b.d2).lo */

      "mad.hi.cc.u32   %6, %13, %16, %6;\n\t" /* (a.d5 * b.d0).hi */
      "madc.hi.u32     %7, %13, %17, %7;\n\t" /* (a.d5 * b.d1).hi */

      "mad.lo.cc.u32   %6, %14, %16, %6;\n\t" /* (a.d6 * b.d0).lo */
      "madc.lo.u32     %7, %14, %17, %7;\n\t" /* (a.d6 * b.d1).lo */

      "mad.hi.u32      %7, %14, %16, %7;\n\t" /* (a.d6 * b.d0).hi */

      "mad.lo.u32      %7, %15, %16, %7;\n\t" /* (a.d7 * b.d0).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5), "=r" (res->d6), "=r" (res->d7)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5), "r" (a.d6), "r" (a.d7),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3), "r" (b.d4), "r" (b.d5), "r" (b.d6), "r" (b.d7));
}


__device__ static void mul_256_224(int256 *res, int256 a, int224 b)
/*
res = a * b (only lower 256 bits of the result)

b has only seven words (b.d0 .. b.d6), there is no b.d7 term.

Operand map of the asm block:
  %0  .. %7  = res->d0 .. res->d7
  %8  .. %15 = a.d0    .. a.d7
  %16 .. %22 = b.d0    .. b.d6

Schoolbook multiplication cut off at word 7: the low half of a.di * b.dj is
added into word i+j and the high half into word i+j+1, and every term that
would land in word 8 or above is left out (so the last instruction of each
row has no .cc).
Since there is no (a.d0 * b.d7).lo to initialise word 7, the first row puts
(a.d7 * b.d0).lo there instead; that is the only place a.d7 is used.
*/
{
  asm("{\n\t"
      "mul.lo.u32      %0, %8, %16;\n\t"     /* (a.d0 * b.d0).lo */
      "mul.lo.u32      %1, %8, %17;\n\t"     /* (a.d0 * b.d1).lo */
      "mul.lo.u32      %2, %8, %18;\n\t"     /* (a.d0 * b.d2).lo */
      "mul.lo.u32      %3, %8, %19;\n\t"     /* (a.d0 * b.d3).lo */
      "mul.lo.u32      %4, %8, %20;\n\t"     /* (a.d0 * b.d4).lo */
      "mul.lo.u32      %5, %8, %21;\n\t"     /* (a.d0 * b.d5).lo */
      "mul.lo.u32      %6, %8, %22;\n\t"     /* (a.d0 * b.d6).lo */
      "mul.lo.u32      %7, %15,%16;\n\t"     /* (a.d7 * b.d0).lo */

      "mad.hi.cc.u32   %1, %8, %16, %1;\n\t" /* (a.d0 * b.d0).hi */
      "madc.hi.cc.u32  %2, %8, %17, %2;\n\t" /* (a.d0 * b.d1).hi */
      "madc.hi.cc.u32  %3, %8, %18, %3;\n\t" /* (a.d0 * b.d2).hi */
      "madc.hi.cc.u32  %4, %8, %19, %4;\n\t" /* (a.d0 * b.d3).hi */
      "madc.hi.cc.u32  %5, %8, %20, %5;\n\t" /* (a.d0 * b.d4).hi */
      "madc.hi.cc.u32  %6, %8, %21, %6;\n\t" /* (a.d0 * b.d5).hi */
      "madc.hi.u32     %7, %8, %22, %7;\n\t" /* (a.d0 * b.d6).hi */

      "mad.lo.cc.u32   %1, %9, %16, %1;\n\t" /* (a.d1 * b.d0).lo */
      "madc.lo.cc.u32  %2, %9, %17, %2;\n\t" /* (a.d1 * b.d1).lo */
      "madc.lo.cc.u32  %3, %9, %18, %3;\n\t" /* (a.d1 * b.d2).lo */
      "madc.lo.cc.u32  %4, %9, %19, %4;\n\t" /* (a.d1 * b.d3).lo */
      "madc.lo.cc.u32  %5, %9, %20, %5;\n\t" /* (a.d1 * b.d4).lo */
      "madc.lo.cc.u32  %6, %9, %21, %6;\n\t" /* (a.d1 * b.d5).lo */
      "madc.lo.u32     %7, %9, %22, %7;\n\t" /* (a.d1 * b.d6).lo */

      "mad.hi.cc.u32   %2, %9, %16, %2;\n\t" /* (a.d1 * b.d0).hi */
      "madc.hi.cc.u32  %3, %9, %17, %3;\n\t" /* (a.d1 * b.d1).hi */
      "madc.hi.cc.u32  %4, %9, %18, %4;\n\t" /* (a.d1 * b.d2).hi */
      "madc.hi.cc.u32  %5, %9, %19, %5;\n\t" /* (a.d1 * b.d3).hi */
      "madc.hi.cc.u32  %6, %9, %20, %6;\n\t" /* (a.d1 * b.d4).hi */
      "madc.hi.u32     %7, %9, %21, %7;\n\t" /* (a.d1 * b.d5).hi */

      "mad.lo.cc.u32   %2, %10, %16, %2;\n\t" /* (a.d2 * b.d0).lo */
      "madc.lo.cc.u32  %3, %10, %17, %3;\n\t" /* (a.d2 * b.d1).lo */
      "madc.lo.cc.u32  %4, %10, %18, %4;\n\t" /* (a.d2 * b.d2).lo */
      "madc.lo.cc.u32  %5, %10, %19, %5;\n\t" /* (a.d2 * b.d3).lo */
      "madc.lo.cc.u32  %6, %10, %20, %6;\n\t" /* (a.d2 * b.d4).lo */
      "madc.lo.u32     %7, %10, %21, %7;\n\t" /* (a.d2 * b.d5).lo */

      "mad.hi.cc.u32   %3, %10, %16, %3;\n\t" /* (a.d2 * b.d0).hi */
      "madc.hi.cc.u32  %4, %10, %17, %4;\n\t" /* (a.d2 * b.d1).hi */
      "madc.hi.cc.u32  %5, %10, %18, %5;\n\t" /* (a.d2 * b.d2).hi */
      "madc.hi.cc.u32  %6, %10, %19, %6;\n\t" /* (a.d2 * b.d3).hi */
      "madc.hi.u32     %7, %10, %20, %7;\n\t" /* (a.d2 * b.d4).hi */

      "mad.lo.cc.u32   %3, %11, %16, %3;\n\t" /* (a.d3 * b.d0).lo */
      "madc.lo.cc.u32  %4, %11, %17, %4;\n\t" /* (a.d3 * b.d1).lo */
      "madc.lo.cc.u32  %5, %11, %18, %5;\n\t" /* (a.d3 * b.d2).lo */
      "madc.lo.cc.u32  %6, %11, %19, %6;\n\t" /* (a.d3 * b.d3).lo */
      "madc.lo.u32     %7, %11, %20, %7;\n\t" /* (a.d3 * b.d4).lo */

      "mad.hi.cc.u32   %4, %11, %16, %4;\n\t" /* (a.d3 * b.d0).hi */
      "madc.hi.cc.u32  %5, %11, %17, %5;\n\t" /* (a.d3 * b.d1).hi */
      "madc.hi.cc.u32  %6, %11, %18, %6;\n\t" /* (a.d3 * b.d2).hi */
      "madc.hi.u32     %7, %11, %19, %7;\n\t" /* (a.d3 * b.d3).hi */

      "mad.lo.cc.u32   %4, %12, %16, %4;\n\t" /* (a.d4 * b.d0).lo */
      "madc.lo.cc.u32  %5, %12, %17, %5;\n\t" /* (a.d4 * b.d1).lo */
      "madc.lo.cc.u32  %6, %12, %18, %6;\n\t" /* (a.d4 * b.d2).lo */
      "madc.lo.u32     %7, %12, %19, %7;\n\t" /* (a.d4 * b.d3).lo */

      "mad.hi.cc.u32   %5, %12, %16, %5;\n\t" /* (a.d4 * b.d0).hi */
      "madc.hi.cc.u32  %6, %12, %17, %6;\n\t" /* (a.d4 * b.d1).hi */
      "madc.hi.u32     %7, %12, %18, %7;\n\t" /* (a.d4 * b.d2).hi */

      "mad.lo.cc.u32   %5, %13, %16, %5;\n\t" /* (a.d5 * b.d0).lo */
      "madc.lo.cc.u32  %6, %13, %17, %6;\n\t" /* (a.d5 * b.d1).lo */
      "madc.lo.u32     %7, %13, %18, %7;\n\t" /* (a.d5 * b.d2).lo */

      "mad.hi.cc.u32   %6, %13, %16, %6;\n\t" /* (a.d5 * b.d0).hi */
      "madc.hi.u32     %7, %13, %17, %7;\n\t" /* (a.d5 * b.d1).hi */

      "mad.lo.cc.u32   %6, %14, %16, %6;\n\t" /* (a.d6 * b.d0).lo */
      "madc.lo.u32     %7, %14, %17, %7;\n\t" /* (a.d6 * b.d1).lo */

      "mad.hi.u32      %7, %14, %16, %7;\n\t" /* (a.d6 * b.d0).hi */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5), "=r" (res->d6), "=r" (res->d7)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5), "r" (a.d6), "r" (a.d7),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3), "r" (b.d4), "r" (b.d5), "r" (b.d6));
}

#if 0
// not enough registers! Used only in MM part of the code

__device__ static void mulsub_256_224(int256 *res, int480 c, int256 a, int224 negb)
/*
res = c - a * b (only lower 256 bits of the result)

The subtraction is carried out as an addition: the caller passes negb and the
code adds a * negb to the low eight words of c. Only c.d0 .. c.d7 are read,
c.d8 and above are ignored.

Operand map of the asm block:
  %0  .. %7  = res->d0  .. res->d7
  %8  .. %15 = a.d0     .. a.d7
  %16 .. %22 = negb.d0  .. negb.d6
  %23 .. %30 = c.d0     .. c.d7

The first row folds c into the result while adding the a.d0 * negb.dj low
halves; after that the layout is the truncated schoolbook multiplication
(low half into word i+j, high half into word i+j+1, nothing above word 7).

NOTE(review): the original comment says "negb.d6 is 0xFFFFFFFF", but negb.d6
(%22) is multiplied like every other word. The only place an all-ones word is
assumed is the final sub.u32, which stands in for a.d0 * 0xFFFFFFFF (mod 2^32)
at word 7 - i.e. a top word "negb.d7" that int224 does not store. Presumably
negb is meant to be the 256-bit two's complement of b with that implicit top
word; confirm against the caller before enabling this code.

This function is compiled out (#if 0) in the file as it stands.
*/
{
  asm("{\n\t"
      "mad.lo.cc.u32   %0, %8, %16, %23;\n\t"  /* c += (a.d0 * negb.d0).lo */
      "madc.lo.cc.u32  %1, %8, %17, %24;\n\t"  /* c += (a.d0 * negb.d1).lo */
      "madc.lo.cc.u32  %2, %8, %18, %25;\n\t"  /* c += (a.d0 * negb.d2).lo */
      "madc.lo.cc.u32  %3, %8, %19, %26;\n\t"  /* c += (a.d0 * negb.d3).lo */
      "madc.lo.cc.u32  %4, %8, %20, %27;\n\t"  /* c += (a.d0 * negb.d4).lo */
      "madc.lo.cc.u32  %5, %8, %21, %28;\n\t"  /* c += (a.d0 * negb.d5).lo */
      "madc.lo.cc.u32  %6, %8, %22, %29;\n\t"  /* c += (a.d0 * negb.d6).lo */
      "madc.lo.u32     %7, %15, %16, %30;\n\t" /* c += (a.d7 * negb.d0).lo */

      "mad.hi.cc.u32   %1, %8, %16, %1;\n\t"   /* c += (a.d0 * negb.d0).hi */
      "madc.hi.cc.u32  %2, %8, %17, %2;\n\t"   /* c += (a.d0 * negb.d1).hi */
      "madc.hi.cc.u32  %3, %8, %18, %3;\n\t"   /* c += (a.d0 * negb.d2).hi */
      "madc.hi.cc.u32  %4, %8, %19, %4;\n\t"   /* c += (a.d0 * negb.d3).hi */
      "madc.hi.cc.u32  %5, %8, %20, %5;\n\t"   /* c += (a.d0 * negb.d4).hi */
      "madc.hi.cc.u32  %6, %8, %21, %6;\n\t"   /* c += (a.d0 * negb.d5).hi */
      "madc.hi.u32     %7, %8, %22, %7;\n\t"   /* c += (a.d0 * negb.d6).hi */

      "mad.lo.cc.u32   %1, %9, %16, %1;\n\t"   /* c += (a.d1 * negb.d0).lo */
      "madc.lo.cc.u32  %2, %9, %17, %2;\n\t"   /* c += (a.d1 * negb.d1).lo */
      "madc.lo.cc.u32  %3, %9, %18, %3;\n\t"   /* c += (a.d1 * negb.d2).lo */
      "madc.lo.cc.u32  %4, %9, %19, %4;\n\t"   /* c += (a.d1 * negb.d3).lo */
      "madc.lo.cc.u32  %5, %9, %20, %5;\n\t"   /* c += (a.d1 * negb.d4).lo */
      "madc.lo.cc.u32  %6, %9, %21, %6;\n\t"   /* c += (a.d1 * negb.d5).lo */
      "madc.lo.u32     %7, %9, %22, %7;\n\t"   /* c += (a.d1 * negb.d6).lo */

      "mad.hi.cc.u32   %2, %9, %16, %2;\n\t"   /* c += (a.d1 * negb.d0).hi */
      "madc.hi.cc.u32  %3, %9, %17, %3;\n\t"   /* c += (a.d1 * negb.d1).hi */
      "madc.hi.cc.u32  %4, %9, %18, %4;\n\t"   /* c += (a.d1 * negb.d2).hi */
      "madc.hi.cc.u32  %5, %9, %19, %5;\n\t"   /* c += (a.d1 * negb.d3).hi */
      "madc.hi.cc.u32  %6, %9, %20, %6;\n\t"   /* c += (a.d1 * negb.d4).hi */
      "madc.hi.u32     %7, %9, %21, %7;\n\t"   /* c += (a.d1 * negb.d5).hi */

      "mad.lo.cc.u32   %2, %10, %16, %2;\n\t"   /* c += (a.d2 * negb.d0).lo */
      "madc.lo.cc.u32  %3, %10, %17, %3;\n\t"   /* c += (a.d2 * negb.d1).lo */
      "madc.lo.cc.u32  %4, %10, %18, %4;\n\t"   /* c += (a.d2 * negb.d2).lo */
      "madc.lo.cc.u32  %5, %10, %19, %5;\n\t"   /* c += (a.d2 * negb.d3).lo */
      "madc.lo.cc.u32  %6, %10, %20, %6;\n\t"   /* c += (a.d2 * negb.d4).lo */
      "madc.lo.u32     %7, %10, %21, %7;\n\t"   /* c += (a.d2 * negb.d5).lo */

      "mad.hi.cc.u32   %3, %10, %16, %3;\n\t"   /* c += (a.d2 * negb.d0).hi */
      "madc.hi.cc.u32  %4, %10, %17, %4;\n\t"   /* c += (a.d2 * negb.d1).hi */
      "madc.hi.cc.u32  %5, %10, %18, %5;\n\t"   /* c += (a.d2 * negb.d2).hi */
      "madc.hi.cc.u32  %6, %10, %19, %6;\n\t"   /* c += (a.d2 * negb.d3).hi */
      "madc.hi.u32     %7, %10, %20, %7;\n\t"   /* c += (a.d2 * negb.d4).hi */

      "mad.lo.cc.u32   %3, %11, %16, %3;\n\t"   /* c += (a.d3 * negb.d0).lo */
      "madc.lo.cc.u32  %4, %11, %17, %4;\n\t"   /* c += (a.d3 * negb.d1).lo */
      "madc.lo.cc.u32  %5, %11, %18, %5;\n\t"   /* c += (a.d3 * negb.d2).lo */
      "madc.lo.cc.u32  %6, %11, %19, %6;\n\t"   /* c += (a.d3 * negb.d3).lo */
      "madc.lo.u32     %7, %11, %20, %7;\n\t"   /* c += (a.d3 * negb.d4).lo */

      "mad.hi.cc.u32   %4, %11, %16, %4;\n\t"   /* c += (a.d3 * negb.d0).hi */
      "madc.hi.cc.u32  %5, %11, %17, %5;\n\t"   /* c += (a.d3 * negb.d1).hi */
      "madc.hi.cc.u32  %6, %11, %18, %6;\n\t"   /* c += (a.d3 * negb.d2).hi */
      "madc.hi.u32     %7, %11, %19, %7;\n\t"   /* c += (a.d3 * negb.d3).hi */

      "mad.lo.cc.u32   %4, %12, %16, %4;\n\t"   /* c += (a.d4 * negb.d0).lo */
      "madc.lo.cc.u32  %5, %12, %17, %5;\n\t"   /* c += (a.d4 * negb.d1).lo */
      "madc.lo.cc.u32  %6, %12, %18, %6;\n\t"   /* c += (a.d4 * negb.d2).lo */
      "madc.lo.u32     %7, %12, %19, %7;\n\t"   /* c += (a.d4 * negb.d3).lo */

      "mad.hi.cc.u32   %5, %12, %16, %5;\n\t"   /* c += (a.d4 * negb.d0).hi */
      "madc.hi.cc.u32  %6, %12, %17, %6;\n\t"   /* c += (a.d4 * negb.d1).hi */
      "madc.hi.u32     %7, %12, %18, %7;\n\t"   /* c += (a.d4 * negb.d2).hi */

      "mad.lo.cc.u32   %5, %13, %16, %5;\n\t"   /* c += (a.d5 * negb.d0).lo */
      "madc.lo.cc.u32  %6, %13, %17, %6;\n\t"   /* c += (a.d5 * negb.d1).lo */
      "madc.lo.u32     %7, %13, %18, %7;\n\t"   /* c += (a.d5 * negb.d2).lo */

      "mad.hi.cc.u32   %6, %13, %16, %6;\n\t"   /* c += (a.d5 * negb.d0).hi */
      "madc.hi.u32     %7, %13, %17, %7;\n\t"   /* c += (a.d5 * negb.d1).hi */

      "mad.lo.cc.u32   %6, %14, %16, %6;\n\t"  /* c += (a.d6 * negb.d0).lo */
      "madc.lo.u32     %7, %14, %17, %7;\n\t"  /* c += (a.d6 * negb.d1).lo */

      "mad.hi.u32      %7, %14, %16, %7;\n\t"  /* c += (a.d6 * negb.d0).hi */

      "sub.u32         %7, %7, %8;\n\t"        /* c += (a.d0 * 0xFFFFFFFF).lo, i.e. c -= a.d0: implicit all-ones top word of negb */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5), "=r" (res->d6), "=r" (res->d7)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4), "r" (a.d5), "r" (a.d6), "r" (a.d7),
        "r" (negb.d0), "r" (negb.d1), "r" (negb.d2), "r" (negb.d3), "r" (negb.d4), "r" (negb.d5), "r" (negb.d6),
        "r" (c.d0), "r" (c.d1), "r" (c.d2), "r" (c.d3), "r" (c.d4), "r" (c.d5), "r" (c.d6), "r" (c.d7));
}
#endif


__device__ static void div_512_256(int256 *res, int512 q, int256 n, float nf)
/*
res = q / n (integer division)

The quotient is built from the most significant bits downwards in 13 steps
(X, Y, Z, 1 .. 10). Each step
  - converts the leading words of the running remainder q to a float qf,
  - takes qi = trunc(qf * nf) as the next chunk of quotient bits,
  - adds qi into res at the bit offset named in the step header, and
  - subtracts n * qi, aligned to the same offset, from q.
After the last step the low 256 bits of q are compared against n once and
res is incremented if they are still >= n.

q   : dividend; passed by value and reused as the running remainder
n   : divisor
nf  : float factor supplied by the caller; the scaling used in the steps
      suggests nf ~= 2^192 / n -- TODO confirm against the callers
res : receives the quotient; all eight words d0..d7 are written

The add/sub helpers with _cc / c in their names are used as carry chains, so
the order of the statements inside each chain must not be changed.
*/
{
  float qf;           // float approximation of the leading words of q
  unsigned int qi;    // quotient chunk estimated in the current step
  int512 nn;          // n * qi, aligned to the current step's offset
  int256 tmp256;      // low 256 bits of the final remainder

/********** Step X, Offset 2^235 (7*32 + 11) **********/
// qf ~= (q.d15:q.d14) * 2^21
  qf= __uint2float_rn(q.d15);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d14);
  qf*= 2097152.0f;

  qi=__float2uint_rz(qf*nf);

// qi is moved to bit 11 of word 7 here; the shifted value is also the one
// used for the multiplication below, so nn needs no separate shift
  res->d7 = (qi <<= 11);

// nn = n * qi
  nn.d7 =                                 __umul32(n.d0, qi);
  nn.d8 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d9 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d10= __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d11= __addc_cc(__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d12= __addc_cc(__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d13= __addc_cc(__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d14= __addc_cc(__umul32hi(n.d6, qi), __umul32(n.d7, qi));
  nn.d15= __addc   (__umul32hi(n.d7, qi),                  0);

//  q = q - nn
  q.d7 = __sub_cc (q.d7 , nn.d7 );
  q.d8 = __subc_cc(q.d8 , nn.d8 );
  q.d9 = __subc_cc(q.d9 , nn.d9 );
  q.d10= __subc_cc(q.d10, nn.d10);
  q.d11= __subc_cc(q.d11, nn.d11);
  q.d12= __subc_cc(q.d12, nn.d12);
  q.d13= __subc_cc(q.d13, nn.d13);
  q.d14= __subc_cc(q.d14, nn.d14);
  q.d15= __subc   (q.d15, nn.d15);

/********** Step Y, Offset 2^215 (6*32 + 23) **********/
// qf ~= (q.d15:q.d14:q.d13) * 2^9
  qf= __uint2float_rn(q.d15);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d14);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d13);
  qf*= 512.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==0 && threadIdx.x == 0) printf ("qY: %X\n", qi);

// qi sits at bit 23 of word 6: low 9 bits go to d6, the rest is added to d7
  res->d6 =  qi << 23;
  res->d7 += qi >>  9;

// nn = n * qi
  nn.d6 =                                 __umul32(n.d0, qi);
  nn.d7 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d8 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d9 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d10= __addc_cc(__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d11= __addc_cc(__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d12= __addc_cc(__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d13= __addc_cc(__umul32hi(n.d6, qi), __umul32(n.d7, qi));
// NOTE(review): the last word uses __addc_cc here (also in steps Z and 1)
// while the other steps use __addc; the extra carry-out does not appear to
// be consumed by anything that follows -- confirm
  nn.d14= __addc_cc(__umul32hi(n.d7, qi),                  0);

// shiftleft nn 23 bits
// (bits shifted out of nn.d14 are dropped; no nn.d15 is produced in this step)
  nn.d14 = (nn.d14 << 23) + (nn.d13 >> 9);
  nn.d13 = (nn.d13 << 23) + (nn.d12 >> 9);
  nn.d12 = (nn.d12 << 23) + (nn.d11 >> 9);
  nn.d11 = (nn.d11 << 23) + (nn.d10 >> 9);
  nn.d10 = (nn.d10 << 23) + (nn.d9  >> 9);
  nn.d9  = (nn.d9  << 23) + (nn.d8  >> 9);
  nn.d8  = (nn.d8  << 23) + (nn.d7  >> 9);
  nn.d7  = (nn.d7  << 23) + (nn.d6  >> 9);
  nn.d6  =  nn.d6  << 23;

// q = q - nn
// (q.d15 is not updated from here on)
  q.d6 = __sub_cc (q.d6 , nn.d6 );
  q.d7 = __subc_cc(q.d7 , nn.d7 );
  q.d8 = __subc_cc(q.d8 , nn.d8 );
  q.d9 = __subc_cc(q.d9 , nn.d9 );
  q.d10= __subc_cc(q.d10, nn.d10);
  q.d11= __subc_cc(q.d11, nn.d11);
  q.d12= __subc_cc(q.d12, nn.d12);
  q.d13= __subc_cc(q.d13, nn.d13);
  q.d14= __subc   (q.d14, nn.d14);
//if (blockIdx.x==0 && threadIdx.x == 0) printf ("qYq: %X %X\n", q.d13, q.d12);

/********** Step Z, Offset 2^195 (6*32 + 3) **********/
// qf ~= (q.d14:q.d13) * 2^29
  qf= __uint2float_rn(q.d14);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d13);
  qf*= 536870912.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==0 && threadIdx.x == 0) printf ("qZ: %X\n", qi);

// pre-shift qi to bit 3 of word 6 and accumulate it, carrying into d7
  qi <<= 3;
  res->d6 = __add_cc(res->d6, qi);
  res->d7 = __addc  (res->d7, 0);

// nn = n * qi
  nn.d6 =                                 __umul32(n.d0, qi);
  nn.d7 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d8 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d9 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d10= __addc_cc(__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d11= __addc_cc(__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d12= __addc_cc(__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d13= __addc_cc(__umul32hi(n.d6, qi), __umul32(n.d7, qi));
  nn.d14= __addc_cc(__umul32hi(n.d7, qi),                  0);

// q = q - nn
  q.d6 = __sub_cc (q.d6 , nn.d6 );
  q.d7 = __subc_cc(q.d7 , nn.d7 );
  q.d8 = __subc_cc(q.d8 , nn.d8 );
  q.d9 = __subc_cc(q.d9 , nn.d9 );
  q.d10= __subc_cc(q.d10, nn.d10);
  q.d11= __subc_cc(q.d11, nn.d11);
  q.d12= __subc_cc(q.d12, nn.d12);
  q.d13= __subc_cc(q.d13, nn.d13);
  q.d14= __subc   (q.d14, nn.d14);
//if (blockIdx.x==0 && threadIdx.x == 0) printf ("qZq: %X %X\n", q.d13, q.d12);

/********** Step 1, Offset 2^175 (5*32 + 15) **********/
// qf ~= (q.d14:q.d13:q.d12) * 2^17
  qf= __uint2float_rn(q.d14);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d13);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d12);
  qf*= 131072.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) if ((qi >> 17) != 0) printf ("1/f fail 1\n");

// qi sits at bit 15 of word 5; its upper part is added to d6 with carry into d7
  res->d5 = qi << 15;
  res->d6 = __add_cc(res->d6, qi >> 17);
  res->d7 = __addc  (res->d7, 0);

// nn = n * qi
  nn.d5 =                                 __umul32(n.d0, qi);
  nn.d6 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d7 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d8 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d9 = __addc_cc(__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d10= __addc_cc(__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d11= __addc_cc(__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d12= __addc_cc(__umul32hi(n.d6, qi), __umul32(n.d7, qi));
  nn.d13= __addc_cc(__umul32hi(n.d7, qi),                  0);

//if (nn.d13 >> 17 != q.d14) printf ("1/f fail 1a\n");
// shiftleft nn 15 bits
// (the word above nn.d13 is deliberately not computed, see the disabled line)
//nn.d14=                 (nn.d13>> 17);
  nn.d13= (nn.d13<< 15) + (nn.d12>> 17);
  nn.d12= (nn.d12<< 15) + (nn.d11>> 17);
  nn.d11= (nn.d11<< 15) + (nn.d10>> 17);
  nn.d10= (nn.d10<< 15) + (nn.d9 >> 17);
  nn.d9 = (nn.d9 << 15) + (nn.d8 >> 17);
  nn.d8 = (nn.d8 << 15) + (nn.d7 >> 17);
  nn.d7 = (nn.d7 << 15) + (nn.d6 >> 17);
  nn.d6 = (nn.d6 << 15) + (nn.d5 >> 17);
  nn.d5 =  nn.d5 << 15;
//if (blockIdx.x==0 && threadIdx.x == 0) printf ("q1nn: %X %X %X\n", nn.d12, nn.d11, nn.d10);

// q = q - nn
  q.d5 = __sub_cc (q.d5 , nn.d5 );
  q.d6 = __subc_cc(q.d6 , nn.d6 );
  q.d7 = __subc_cc(q.d7 , nn.d7 );
  q.d8 = __subc_cc(q.d8 , nn.d8 );
  q.d9 = __subc_cc(q.d9 , nn.d9 );
  q.d10= __subc_cc(q.d10, nn.d10);
  q.d11= __subc_cc(q.d11, nn.d11);
  q.d12= __subc_cc(q.d12, nn.d12);
  q.d13= __subc   (q.d13, nn.d13);

/********** Step 2, Offset 2^155 (4*32 + 27) **********/
// qf ~= (q.d13:q.d12:q.d11) * 2^5
  qf= __uint2float_rn(q.d13);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d12);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d11);
  qf*= 32.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==0 && threadIdx.x == 0) printf ("q2: %X\n", qi);

// qi sits at bit 27 of word 4; the upper part ripples into d5..d7
  res->d4 = qi<<27;
  res->d5 = __add_cc(res->d5, qi>>5);
  res->d6 = __addc_cc(res->d6, 0);
  res->d7 = __addc   (res->d7, 0);

// nn = n * qi
  nn.d4  =                                 __umul32(n.d0, qi);
  nn.d5  = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d6  = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d7  = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d8  = __addc_cc(__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d9  = __addc_cc(__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d10 = __addc_cc(__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d11 = __addc_cc(__umul32hi(n.d6, qi), __umul32(n.d7, qi));
  nn.d12 = __addc   (__umul32hi(n.d7, qi),                  0);

//if (blockIdx.x==12 && threadIdx.x == 4) if (nn.d12 >> 5 != q.d13) printf ("1/f fail 2\n");
// shiftleft nn 27 bits
// (the word above nn.d12 is deliberately not computed, see the disabled line)
//nn.d13 =                  (nn.d12 >> 5);
  nn.d12 = (nn.d12 << 27) + (nn.d11 >> 5);
  nn.d11 = (nn.d11 << 27) + (nn.d10 >> 5);
  nn.d10 = (nn.d10 << 27) + (nn.d9  >> 5);
  nn.d9  = (nn.d9  << 27) + (nn.d8  >> 5);
  nn.d8  = (nn.d8  << 27) + (nn.d7  >> 5);
  nn.d7  = (nn.d7  << 27) + (nn.d6  >> 5);
  nn.d6  = (nn.d6  << 27) + (nn.d5  >> 5);
  nn.d5  = (nn.d5  << 27) + (nn.d4  >> 5);
  nn.d4  =  nn.d4  << 27;

//  q = q - nn
  q.d4  = __sub_cc (q.d4,  nn.d4);
  q.d5  = __subc_cc(q.d5,  nn.d5);
  q.d6  = __subc_cc(q.d6,  nn.d6);
  q.d7  = __subc_cc(q.d7,  nn.d7);
  q.d8  = __subc_cc(q.d8,  nn.d8);
  q.d9  = __subc_cc(q.d9,  nn.d9);
  q.d10 = __subc_cc(q.d10, nn.d10);
  q.d11 = __subc_cc(q.d11, nn.d11);
  q.d12 = __subc   (q.d12, nn.d12);

/********** Step 3, Offset 2^135 (4*32 + 7) **********/
// qf ~= (q.d12:q.d11) * 2^25
  qf= __uint2float_rn(q.d12);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d11);
  qf*= 33554432.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q3: %X\n", qi);

// pre-shift qi to bit 7 of word 4 and accumulate it, rippling into d5..d7
  qi <<= 7;
  res->d4 = __add_cc (res->d4, qi);
  res->d5 = __addc_cc(res->d5,  0);
  res->d6 = __addc_cc(res->d6, 0);
  res->d7 = __addc   (res->d7, 0);

// nn = n * qi
  nn.d4  =                                 __umul32(n.d0, qi);
  nn.d5  = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d6  = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d7  = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d8  = __addc_cc(__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d9  = __addc_cc(__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d10 = __addc_cc(__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d11 = __addc_cc(__umul32hi(n.d6, qi), __umul32(n.d7, qi));
  nn.d12 = __addc   (__umul32hi(n.d7, qi),                  0);

//  q = q - nn
  q.d4  = __sub_cc (q.d4,  nn.d4);
  q.d5  = __subc_cc(q.d5,  nn.d5);
  q.d6  = __subc_cc(q.d6,  nn.d6);
  q.d7  = __subc_cc(q.d7,  nn.d7);
  q.d8  = __subc_cc(q.d8,  nn.d8);
  q.d9  = __subc_cc(q.d9,  nn.d9);
  q.d10 = __subc_cc(q.d10, nn.d10);
  q.d11 = __subc_cc(q.d11, nn.d11);
  q.d12 = __subc   (q.d12, nn.d12);

/********** Step 4, Offset 2^115 (3*32 + 19) **********/
// qf ~= (q.d12:q.d11:q.d10) * 2^13
  qf= __uint2float_rn(q.d12);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d11);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d10);
  qf*= 8192.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q4: %X\n", qi);

// qi sits at bit 19 of word 3; the upper part ripples into d4..d7
  res->d3 = qi << 19;
  res->d4 = __add_cc (res->d4, qi >> 13);
  res->d5 = __addc_cc(res->d5, 0);
  res->d6 = __addc_cc(res->d6, 0);
  res->d7 = __addc   (res->d7, 0);

// nn = n * qi
  nn.d3 =                                 __umul32(n.d0, qi);
  nn.d4 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d7 = __addc_cc(__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d8 = __addc_cc(__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d9 = __addc_cc(__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d10= __addc_cc(__umul32hi(n.d6, qi), __umul32(n.d7, qi));
  nn.d11= __addc   (__umul32hi(n.d7, qi),                  0);

// shiftleft nn 19 bits
  nn.d11 = (nn.d11 << 19)+ (nn.d10>> 13);
  nn.d10 = (nn.d10 << 19)+ (nn.d9 >> 13);
  nn.d9  = (nn.d9 << 19) + (nn.d8 >> 13);
  nn.d8  = (nn.d8 << 19) + (nn.d7 >> 13);
  nn.d7  = (nn.d7 << 19) + (nn.d6 >> 13);
  nn.d6  = (nn.d6 << 19) + (nn.d5 >> 13);
  nn.d5  = (nn.d5 << 19) + (nn.d4 >> 13);
  nn.d4  = (nn.d4 << 19) + (nn.d3 >> 13);
  nn.d3  =  nn.d3 << 19;

//  q = q - nn
  q.d3  = __sub_cc (q.d3,  nn.d3);
  q.d4  = __subc_cc(q.d4,  nn.d4);
  q.d5  = __subc_cc(q.d5,  nn.d5);
  q.d6  = __subc_cc(q.d6,  nn.d6);
  q.d7  = __subc_cc(q.d7,  nn.d7);
  q.d8  = __subc_cc(q.d8,  nn.d8);
  q.d9  = __subc_cc(q.d9,  nn.d9);
  q.d10 = __subc_cc(q.d10, nn.d10);
  q.d11 = __subc   (q.d11, nn.d11);
//-- need q.d11 -- checked ~SB
//if (0 != q.d11) printf ("1/f fail 4d\n");

/********** Step 5, Offset 2^95 (2*32 + 31) **********/
// qf ~= (q.d11:q.d10:q.d9) * 2
  qf= __uint2float_rn(q.d11);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d10);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d9);
  qf*= 2.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q5: %X\n", qi);

// qi sits at bit 31 of word 2; the upper part ripples into d3..d7
  res->d2 = qi << 31;
  res->d3 = __add_cc (res->d3, qi >> 1);
  res->d4 = __addc_cc(res->d4, 0);
  res->d5 = __addc_cc(res->d5, 0);
  res->d6 = __addc_cc(res->d6, 0);
  res->d7 = __addc   (res->d7, 0);

// nn = n * qi
  nn.d2 =                                 __umul32(n.d0, qi);
  nn.d3 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d7 = __addc_cc(__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d8 = __addc_cc(__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d9 = __addc_cc(__umul32hi(n.d6, qi), __umul32(n.d7, qi));
  nn.d10= __addc   (__umul32hi(n.d7, qi),                  0);

//if (nn.d10 >> 1 != q.d11) printf ("1/f fail 7\n");
// shiftleft nn 31 bits
  nn.d10= (nn.d10<< 31) + (nn.d9 >> 1);
  nn.d9 = (nn.d9 << 31) + (nn.d8 >> 1);
  nn.d8 = (nn.d8 << 31) + (nn.d7 >> 1);
  nn.d7 = (nn.d7 << 31) + (nn.d6 >> 1);
  nn.d6 = (nn.d6 << 31) + (nn.d5 >> 1);
  nn.d5 = (nn.d5 << 31) + (nn.d4 >> 1);
  nn.d4 = (nn.d4 << 31) + (nn.d3 >> 1);
  nn.d3 = (nn.d3 << 31) + (nn.d2 >> 1);
  nn.d2 =  nn.d2 << 31;

//  q = q - nn
  q.d2 = __sub_cc (q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc_cc(q.d5, nn.d5);
  q.d6 = __subc_cc(q.d6, nn.d6);
  q.d7 = __subc_cc(q.d7, nn.d7);
  q.d8 = __subc_cc(q.d8, nn.d8);
  q.d9 = __subc_cc(q.d9, nn.d9);
  q.d10= __subc   (q.d10, nn.d10);

/********** Step 6, Offset 2^75 (2*32 + 11) **********/
// qf ~= (q.d10:q.d9) * 2^21
  qf= __uint2float_rn(q.d10);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d9);
  qf*= 2097152.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q6: %X\n", qi);

// pre-shift qi to bit 11 of word 2 and accumulate it, rippling into d3..d7
  qi <<= 11;
  res->d2 = __add_cc (res->d2, qi);
  res->d3 = __addc_cc(res->d3, 0);
  res->d4 = __addc_cc(res->d4, 0);
  res->d5 = __addc_cc(res->d5, 0);
  res->d6 = __addc_cc(res->d6, 0);
  res->d7 = __addc   (res->d7, 0);

// nn = n * qi
  nn.d2 =                                 __umul32(n.d0, qi);
  nn.d3 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d7 = __addc_cc(__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d8 = __addc_cc(__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d9 = __addc_cc(__umul32hi(n.d6, qi), __umul32(n.d7, qi));
  nn.d10= __addc   (__umul32hi(n.d7, qi),                  0);

//  q = q - nn
  q.d2 = __sub_cc (q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc_cc(q.d5, nn.d5);
  q.d6 = __subc_cc(q.d6, nn.d6);
  q.d7 = __subc_cc(q.d7, nn.d7);
  q.d8 = __subc_cc(q.d8, nn.d8);
  q.d9 = __subc_cc(q.d9, nn.d9);
  q.d10= __subc   (q.d10, nn.d10);

/********** Step 7, Offset 2^55 (1*32 + 23) **********/
// qf ~= (q.d10:q.d9:q.d8) * 2^9
  qf= __uint2float_rn(q.d10);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d9);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d8);
  qf*= 512.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q7: %X\n", qi);

// qi sits at bit 23 of word 1; the upper part ripples into d2..d7
  res->d1 = qi << 23;
  res->d2 = __add_cc (res->d2, qi >> 9);
  res->d3 = __addc_cc(res->d3, 0);
  res->d4 = __addc_cc(res->d4, 0);
  res->d5 = __addc_cc(res->d5, 0);
  res->d6 = __addc_cc(res->d6, 0);
  res->d7 = __addc   (res->d7, 0);

// nn = n * qi
  nn.d1 =                                 __umul32(n.d0, qi);
  nn.d2 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d3 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d7 = __addc_cc(__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d8 = __addc_cc(__umul32hi(n.d6, qi), __umul32(n.d7, qi));
  nn.d9 = __addc   (__umul32hi(n.d7, qi),                  0);

//if (nn.d8 >> 9 != q.d9) printf ("1/f fail 7b\n");
// shiftleft nn 23 bits
  nn.d9 = (nn.d9 << 23) + (nn.d8 >> 9);
  nn.d8 = (nn.d8 << 23) + (nn.d7 >> 9);
  nn.d7 = (nn.d7 << 23) + (nn.d6 >> 9);
  nn.d6 = (nn.d6 << 23) + (nn.d5 >> 9);
  nn.d5 = (nn.d5 << 23) + (nn.d4 >> 9);
  nn.d4 = (nn.d4 << 23) + (nn.d3 >> 9);
  nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
  nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
  nn.d1 =  nn.d1 << 23;

// q = q - nn
  q.d1 = __sub_cc (q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc_cc(q.d5, nn.d5);
  q.d6 = __subc_cc(q.d6, nn.d6);
  q.d7 = __subc_cc(q.d7, nn.d7);
  q.d8 = __subc_cc(q.d8, nn.d8);
  q.d9 = __subc   (q.d9, nn.d9);

/********** Step 8, Offset 2^35 (1*32 + 3) **********/

// qf ~= (q.d9:q.d8) * 2^29
  qf= __uint2float_rn(q.d9);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d8);
  qf*= 536870912.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q8: %X\n", qi);

// pre-shift qi to bit 3 of word 1 and accumulate it, rippling into d2..d7
  qi <<= 3;
  res->d1 = __add_cc (res->d1, qi);
  res->d2 = __addc_cc(res->d2, 0);
  res->d3 = __addc_cc(res->d3, 0);
  res->d4 = __addc_cc(res->d4, 0);
  res->d5 = __addc_cc(res->d5, 0);
  res->d6 = __addc_cc(res->d6, 0);
  res->d7 = __addc   (res->d7, 0);

// nn = n * qi
  nn.d1 =                                 __umul32(n.d0, qi);
  nn.d2 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d3 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d7 = __addc_cc(__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d8 = __addc_cc(__umul32hi(n.d6, qi), __umul32(n.d7, qi));
  nn.d9 = __addc   (__umul32hi(n.d7, qi),                  0);

//  q = q - nn
  q.d1 = __sub_cc (q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc_cc(q.d5, nn.d5);
  q.d6 = __subc_cc(q.d6, nn.d6);
  q.d7 = __subc_cc(q.d7, nn.d7);
  q.d8 = __subc_cc(q.d8, nn.d8);
  q.d9 = __subc   (q.d9, nn.d9);

/********** Step 9, Offset 2^15 (0*32 + 15) **********/

// qf ~= (q.d9:q.d8:q.d7) * 2^17
  qf= __uint2float_rn(q.d9);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d8);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d7);
  qf*= 131072.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q9: %X\n", qi);

// qi sits at bit 15 of word 0; the upper part ripples into d1..d7
  res->d0 = qi << 15;
  res->d1 = __add_cc (res->d1, qi >> 17);
  res->d2 = __addc_cc(res->d2, 0);
  res->d3 = __addc_cc(res->d3, 0);
  res->d4 = __addc_cc(res->d4, 0);
  res->d5 = __addc_cc(res->d5, 0);
  res->d6 = __addc_cc(res->d6, 0);
  res->d7 = __addc   (res->d7, 0);

// nn = n * qi
  nn.d0 =                                 __umul32(n.d0, qi);
  nn.d1 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d2 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d3 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d7 = __addc_cc(__umul32hi(n.d6, qi), __umul32(n.d7, qi));
  nn.d8 = __addc   (__umul32hi(n.d7, qi),                  0);

//if (blockIdx.x==12 && threadIdx.x == 4) if (nn.d6 >> 17 != q.d7) printf ("1/f fail 9\n");

// shiftleft nn 15 bits
// (the word above nn.d8 is deliberately not computed, see the disabled line)
//nn.d9 =                  nn.d8 >> 17;
  nn.d8 = (nn.d8 << 15) + (nn.d7 >> 17);
  nn.d7 = (nn.d7 << 15) + (nn.d6 >> 17);
  nn.d6 = (nn.d6 << 15) + (nn.d5 >> 17);
  nn.d5 = (nn.d5 << 15) + (nn.d4 >> 17);
  nn.d4 = (nn.d4 << 15) + (nn.d3 >> 17);
  nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
  nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
  nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
  nn.d0 =  nn.d0 << 15;

//  q = q - nn
  q.d0 = __sub_cc (q.d0, nn.d0);
  q.d1 = __subc_cc(q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc_cc(q.d5, nn.d5);
  q.d6 = __subc_cc(q.d6, nn.d6);
  q.d7 = __subc_cc(q.d7, nn.d7);
  q.d8 = __subc   (q.d8, nn.d8);

/********** Step 10, Offset 2^0 (0*32 + 0) **********/

// qf ~= (q.d8:q.d7:q.d6), no extra scaling in the last step
  qf= __uint2float_rn(q.d8);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d7);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d6);

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q10: %X\n", qi);

// qi is added unshifted at word 0, rippling the carry through d1..d7
  res->d0 = __add_cc (res->d0, qi);
  res->d1 = __addc_cc(res->d1, 0);
  res->d2 = __addc_cc(res->d2, 0);
  res->d3 = __addc_cc(res->d3, 0);
  res->d4 = __addc_cc(res->d4, 0);
  res->d5 = __addc_cc(res->d5, 0);
  res->d6 = __addc_cc(res->d6, 0);
  res->d7 = __addc   (res->d7, 0);

// nn = n * qi
// (only the low eight words are formed; the high half of n.d7 * qi is not used)
  nn.d0 =                                  __umul32(n.d0, qi);
  nn.d1 = __add_cc  (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d2 = __addc_cc (__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d3 = __addc_cc (__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d4 = __addc_cc (__umul32hi(n.d3, qi), __umul32(n.d4, qi));
  nn.d5 = __addc_cc (__umul32hi(n.d4, qi), __umul32(n.d5, qi));
  nn.d6 = __addc_cc (__umul32hi(n.d5, qi), __umul32(n.d6, qi));
  nn.d7 = __addc    (__umul32hi(n.d6, qi), __umul32(n.d7, qi));

//  q = q - nn
  q.d0 = __sub_cc (q.d0, nn.d0);
  q.d1 = __subc_cc(q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc_cc(q.d5, nn.d5);
  q.d6 = __subc_cc(q.d6, nn.d6);
  q.d7 = __subc   (q.d7, nn.d7);

/*
qi is always a little bit too small, this is OK for all steps except the last
one. Sometimes the result is a little bit bigger than n
*/

//if (blockIdx.x == 12 && threadIdx.x == 4)
//printf ("  rem: %X %X %X %X %X %X %X\r\n", q.d6, q.d5, q.d4, q.d3, q.d2, q.d1, q.d0);

// copy the low 256 bits of the remainder so they can be compared against n
  tmp256.d0=q.d0;
  tmp256.d1=q.d1;
  tmp256.d2=q.d2;
  tmp256.d3=q.d3;
  tmp256.d4=q.d4;
  tmp256.d5=q.d5;
  tmp256.d6=q.d6;
  tmp256.d7=q.d7;

// single final correction: remainder still >= n means the quotient is one short
  if(cmp_ge_256(tmp256,n))
  {
    res->d0 = __add_cc (res->d0, 1);
    res->d1 = __addc_cc(res->d1, 0);
    res->d2 = __addc_cc(res->d2, 0);
    res->d3 = __addc_cc(res->d3, 0);
    res->d4 = __addc_cc(res->d4, 0);
    res->d5 = __addc_cc(res->d5, 0);
    res->d6 = __addc_cc(res->d6, 0);
    res->d7 = __addc   (res->d7, 0);
  }
}


__device__ static void div_480_256(int256 *res, int480 q, int256 n, float nf)
/*
res = q / n (integer division)

Convenience wrapper: the 480-bit dividend is copied word by word into a
512-bit value whose top word is zero, and the work is handed to
div_512_256() with the same n and nf.
*/
{
  int512 widened;

  /* the extra top word of the 512-bit value is cleared */
  widened.d15 = 0;

  /* words d14 .. d0 are taken over unchanged from q */
  widened.d14 = q.d14;
  widened.d13 = q.d13;
  widened.d12 = q.d12;
  widened.d11 = q.d11;
  widened.d10 = q.d10;
  widened.d9  = q.d9;
  widened.d8  = q.d8;
  widened.d7  = q.d7;
  widened.d6  = q.d6;
  widened.d5  = q.d5;
  widened.d4  = q.d4;
  widened.d3  = q.d3;
  widened.d2  = q.d2;
  widened.d1  = q.d1;
  widened.d0  = q.d0;

  div_512_256(res, widened, n, nf);
}


__device__ static void mod_simple_256_224(int224 *res, int256 q, int224 n, float nf)
/*
res = q mod n

Refinement step of the barrett modular multiplication. A single quotient
word is estimated in floating point from the three top words of q, n times
that word is subtracted from q, and n is subtracted one more time if the
result is still not below n.

Assumes q < Xn where X is a small integer.
nf is the float factor supplied by the caller (presumably a scaled 1/n --
confirm against the callers).
*/
{
  float approx;         /* float image of q.d7:q.d6:q.d5 */
  unsigned int quot;    /* estimated quotient word */
  unsigned int top;     /* word 7 of q - n*quot, not representable in res */
  int256 prod;          /* n * quot */

  approx = __uint2float_rn(q.d7);
  approx = approx * 4294967296.0f + __uint2float_rn(q.d6);
  approx = approx * 4294967296.0f + __uint2float_rn(q.d5);

  quot = __float2uint_rz(approx * nf);

  /* prod = n * quot; the statements below form one carry chain */
  prod.d0 =                             __umul32(n.d0, quot);
  prod.d1 = __umad32hi_cc  (n.d0, quot, __umul32(n.d1, quot));
  prod.d2 = __umad32hic_cc (n.d1, quot, __umul32(n.d2, quot));
  prod.d3 = __umad32hic_cc (n.d2, quot, __umul32(n.d3, quot));
  prod.d4 = __umad32hic_cc (n.d3, quot, __umul32(n.d4, quot));
  prod.d5 = __umad32hic_cc (n.d4, quot, __umul32(n.d5, quot));
  prod.d6 = __umad32hic_cc (n.d5, quot, __umul32(n.d6, quot));
  prod.d7 = __umad32hic    (n.d6, quot,                    0);

  /* res = low seven words of (q - prod); word 7 is kept aside in top */
  res->d0 = __sub_cc (q.d0, prod.d0);
  res->d1 = __subc_cc(q.d1, prod.d1);
  res->d2 = __subc_cc(q.d2, prod.d2);
  res->d3 = __subc_cc(q.d3, prod.d3);
  res->d4 = __subc_cc(q.d4, prod.d4);
  res->d5 = __subc_cc(q.d5, prod.d5);
  res->d6 = __subc_cc(q.d6, prod.d6);
  top     = __subc   (q.d7, prod.d7);

  /* final adjustment in case the remainder is still >= n */
  if(top || cmp_ge_224(*res, n))
  {
    sub_224(res, *res, n);
  }
}


__device__ static void mod_simple_256(int256 *res, int256 q, int256 n, float nf)
/*
res = q mod n

Refinement step of the barrett modular multiplication. A single quotient
word is estimated in floating point from the two top words of q, n times
that word is subtracted from q, and n is subtracted one more time if the
result is still not below n.

Assumes q < Xn where X is a small integer.
nf is the float factor supplied by the caller (presumably a scaled 1/n --
confirm against the callers).
*/
{
  float approx;         /* float image of q.d7:q.d6 */
  unsigned int quot;    /* estimated quotient word */
  int256 prod;          /* low 256 bits of n * quot */

  approx = __uint2float_rn(q.d7);
  approx = approx * 4294967296.0f + __uint2float_rn(q.d6);

  quot = __float2uint_rz(approx * nf);

  /* prod = n * quot; the statements below form one carry chain */
  prod.d0 =                             __umul32(n.d0, quot);
  prod.d1 = __umad32hi_cc  (n.d0, quot, __umul32(n.d1, quot));
  prod.d2 = __umad32hic_cc (n.d1, quot, __umul32(n.d2, quot));
  prod.d3 = __umad32hic_cc (n.d2, quot, __umul32(n.d3, quot));
  prod.d4 = __umad32hic_cc (n.d3, quot, __umul32(n.d4, quot));
  prod.d5 = __umad32hic_cc (n.d4, quot, __umul32(n.d5, quot));
  prod.d6 = __umad32hic_cc (n.d5, quot, __umul32(n.d6, quot));
  prod.d7 = __umad32hic    (n.d6, quot, __umul32(n.d7, quot));

  sub_256(res, q, prod);

  /* already below n: nothing left to adjust */
  if(!cmp_ge_256(*res, n)) return;

  /* final adjustment in case the remainder is still >= n */
  sub_256(res, *res, n);
}
