/* genefer.c -- 1.3
   A program for finding large probable generalized Fermat primes.

Copyright (C) 2001-2002, Yves GALLOT, galloty@wanadoo.fr

GENEFER is free source code. You can redistribute, use and/or modify it.
Thank you to give a feedback to the author if improvement is realized.
It is distributed in the hope that it will be useful.
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <string.h>
#if defined(_MSC_VER)
#	include <malloc.h>
#endif
#if	(defined(_MSC_VER) && defined(_M_IX86))
#	include <float.h>
#endif

#if !defined(M_PI)
#	define M_PI 3.1415926535897932364626
#endif

#if !defined(CONST)
#define	CONST		const
#endif

typedef struct
{
	double x, y;
} Complex;

#if defined(_MSC_VER)
	typedef	__int32				Int32;
	typedef	unsigned __int32	UInt32;
	typedef	unsigned __int64	UInt64;
#else
	typedef	int					Int32;
	typedef	unsigned int		UInt32;
	typedef	unsigned long long	UInt64;
#endif

/* depend on cache size */
#if !defined(FOUR_STEP_LIMIT)
#	define	FOUR_STEP_LIMIT	2048
#endif

/* depend on length and number of cache lines */
#if !defined(GAP)
#	define	GAP				4
#endif

#define	BLK_SIZE			16

static Complex e0[FOUR_STEP_LIMIT], e1[2*FOUR_STEP_LIMIT], e2[FOUR_STEP_LIMIT];
static Complex * e3;
static float maxErr;

#define	C5152	6755399441055744.0
#define	C6263	13835058055282163712.0

/* optimize this code for your compiler and processor */
#if	(defined(_MSC_VER) && defined(_M_IX86))
#	define VERSION	"x86"
#	define round(x)	((x + C6263) - C6263)
#	define B_MAX	35000000
#elif (defined(__sparc__) || defined(sparc) || defined(__sparc))
#	define VERSION	"Sparc"
#	define round(x)	((x + C5152) - C5152)
#	define B_MAX	30000000
#elif (defined(__alpha__) || defined(alpha) || defined(__alpha))
#	define VERSION	"Alpha"
#	define round(x)	((x + C5152) - C5152)
#	define B_MAX	30000000
#elif (defined(__POWERPC__) || defined(POWERPC) || defined(__POWERPC))
#	define VERSION	"PowerPC"
#	define round(x)	((x + C5152) - C5152)
#	define B_MAX	30000000
#elif (defined(__mips) || defined(mips) || defined(__mips))
#	define VERSION	"MIPS"
#	define round(x)	((x + C5152) - C5152)
#	define B_MAX	30000000
#else
#	define VERSION	"standard"
#	define round(x)	((x >= 0) ? floor(x + 0.5) : ceil(x - 0.5))
#	define B_MAX	30000000
#endif

#if	(defined(_MSC_VER) && defined(_M_IX86))
#	define	myAlloc(n)	_aligned_malloc(n, 64)
#	define	myFree(x)	_aligned_free(x)
#else
#	define	myAlloc(n)	malloc(n)
#	define	myFree(x)	free(x)
#endif

static UInt32 lpt(CONST UInt32 n)	/* least power of two greater than n */
{
	UInt32 i;
	for (i = 1; i < n; i *= 2);
	return i;
}

static UInt32 ilog(CONST UInt32 n)	/* floor log base 2 */
{
	UInt32 r, i;
	for (r = 0, i = n; i != 1; i /= 2, ++r);
	return r;
}

static unsigned int lg(CONST UInt32 * CONST a, CONST unsigned int len)
{
	UInt32 x;
	unsigned int l = (len - 1) * 8 * sizeof(UInt32) - 1;

	for (x = a[len-1]; x != 0; x /= 2)
		++l;

	return l;
}

static UInt32 mul_1_add_n(UInt32 * CONST a, CONST UInt32 n, CONST UInt32 * CONST b, CONST unsigned int len)
{
	unsigned int i;
	UInt64 l = 0;

	for (i = 0; i != len; ++i)
	{
		l += b[i] * (UInt64)n + a[i];
		a[i] = (UInt32)l;
		l >>= 32;
	}
	return (UInt32)l;
}

static void initExp(CONST unsigned int n1, CONST unsigned int n2)
{
	unsigned int i, j, k, l;
	Complex * ePtr;
	CONST double twoPi_n = 2 * M_PI / (n1 * n2);

	e3 = (Complex *)myAlloc(n2 * (BLK_SIZE + n1 / BLK_SIZE) * sizeof(Complex));
	for (j = 1; j != n2; ++j)
	{
		/* compute BitRev */
		for (i = n2 >> 1, k = 0, l = j; i != 0; i >>= 1, l >>= 1)
			k += k + (l & 1);

		ePtr = &e3[k * (BLK_SIZE + n1 / BLK_SIZE)];
		for (i = 0; i != BLK_SIZE; ++i)
		{
			ePtr->x = cos(twoPi_n * j * i);
			ePtr->y = sin(twoPi_n * j * i);
			++ePtr;
		}
		for (i = 0; i != n1; i += BLK_SIZE)
		{
			ePtr->x = cos(twoPi_n * j * i);
			ePtr->y = sin(twoPi_n * j * i);
			++ePtr;
		}
	}
}

static void FFTinit(CONST unsigned int n, CONST unsigned int n2)
{
	unsigned int i;
	CONST double twoPi_n = 2 * M_PI / n;
	double theta;

	e0[0].x = 0.5 * sqrt(2.0);
	for (i = 1; i != n / 4; ++i)
	{
		theta = twoPi_n * i;
		e0[3*i + 0].x = cos(1*theta);
		e0[3*i + 0].y = sin(1*theta);
		e0[3*i + 1].x = cos(2*theta);
		e0[3*i + 1].y = sin(2*theta);
		e0[3*i + 2].x = cos(3*theta);
		e0[3*i + 2].y = sin(3*theta);
	}

	if (n2 != 0)
		initExp(n, n2);
}

static void mulExp(Complex * CONST z, CONST unsigned int n, CONST unsigned int nsc)
{
	unsigned int i, j;
	double c, s, x, y;
	Complex * zPtr = &z[1];
	CONST Complex * ebPtr = &e3[nsc + 1];
	CONST Complex * eaPtr = &ebPtr[BLK_SIZE];

	for (j = BLK_SIZE - 1; j != 0; --j)
	{
		x = ebPtr->x * zPtr->x + ebPtr->y * zPtr->y;
		y = ebPtr->x * zPtr->y - ebPtr->y * zPtr->x;
		++ebPtr;
		zPtr->x = x;
		zPtr->y = y;
		++zPtr;
	}
	ebPtr -= BLK_SIZE - 1;

	for (i = n / BLK_SIZE - 1; i != 0; --i)
	{
		x = eaPtr->x * zPtr->x + eaPtr->y * zPtr->y;
		y = eaPtr->x * zPtr->y - eaPtr->y * zPtr->x;
		zPtr->x = x;
		zPtr->y = y;
		++zPtr;
		for (j = BLK_SIZE - 1; j != 0; --j)
		{
			c = eaPtr->x * ebPtr->x - eaPtr->y * ebPtr->y;
			s = eaPtr->x * ebPtr->y + eaPtr->y * ebPtr->x;
			++ebPtr;
			x = c * zPtr->x + s * zPtr->y;
			y = c * zPtr->y - s * zPtr->x;
			zPtr->x = x;
			zPtr->y = y;
			++zPtr;
		}
		ebPtr -= BLK_SIZE - 1;
		++eaPtr;
	}
}

static void FFTsquareFFT(Complex * CONST z, CONST unsigned int n, CONST unsigned int nsc)
{
	unsigned int i, j, m, l;
	Complex * zPtr, * z2Ptr;
	CONST Complex * ePtr;
	double x000, y000, x001, y001, x002, y002, x003, y003, x010, y010;
	double x100, y100, x101, y101, x102, y102, x103, y103, x110, y110;

	if (nsc != 0)
		mulExp(z, n, nsc);

	m = n / 4, l = 3;

	for (; m > 2; m /= 4, l *= 4)
	{
		zPtr  = &z[0];
		z2Ptr = &z[2*m];
		for (i = l; i != 0; i -= 3)
		{
			x000 = zPtr[0].x + z2Ptr[0].x;
			x002 = zPtr[m].x + z2Ptr[m].x;
			y000 = zPtr[0].y + z2Ptr[0].y;
			y002 = zPtr[m].y + z2Ptr[m].y;
			x001 = zPtr[0].x - z2Ptr[0].x;
			y003 = zPtr[m].y - z2Ptr[m].y;
			y001 = zPtr[0].y - z2Ptr[0].y;
			x003 = zPtr[m].x - z2Ptr[m].x;
			zPtr[0].x  = x000 + x002;
			zPtr[0].y  = y000 + y002;
			zPtr[m].x  = x000 - x002;
			zPtr[m].y  = y000 - y002;
			z2Ptr[0].x = x001 + y003;
			z2Ptr[0].y = y001 - x003;
			z2Ptr[m].x = x001 - y003;
			z2Ptr[m].y = y001 + x003;

			ePtr = &e0[l];

			x100 = zPtr[1].x     + z2Ptr[1].x;
			x102 = zPtr[m + 1].x + z2Ptr[m + 1].x;
			y100 = zPtr[1].y     + z2Ptr[1].y;
			y102 = zPtr[m + 1].y + z2Ptr[m + 1].y;
			x101 = zPtr[1].x     - z2Ptr[1].x;
			y103 = zPtr[m + 1].y - z2Ptr[m + 1].y;
			y101 = zPtr[1].y     - z2Ptr[1].y;
			x103 = zPtr[m + 1].x - z2Ptr[m + 1].x;
			zPtr[1].x = x100 + x102;
			zPtr[1].y = y100 + y102;
			x100 -= x102;
			y100 -= y102;
			x110 = x101 + y103;
			y110 = y101 - x103;
			x101 -= y103;
			y101 += x103;
			zPtr[m + 1].x  = ePtr[1].x * x100 + ePtr[1].y * y100;
			zPtr[m + 1].y  = ePtr[1].x * y100 - ePtr[1].y * x100;
			z2Ptr[1].x     = ePtr[0].x * x110 + ePtr[0].y * y110;
			z2Ptr[1].y     = ePtr[0].x * y110 - ePtr[0].y * x110;
			z2Ptr[m + 1].x = ePtr[2].x * x101 + ePtr[2].y * y101;
			z2Ptr[m + 1].y = ePtr[2].x * y101 - ePtr[2].y * x101;

			zPtr += 2; z2Ptr += 2;
			ePtr += l;

			for (j = m - 2; j != 0; j -= 2)
			{
				x000 = zPtr[0].x + z2Ptr[0].x;
				x002 = zPtr[m].x + z2Ptr[m].x;
				y000 = zPtr[0].y + z2Ptr[0].y;
				y002 = zPtr[m].y + z2Ptr[m].y;
				x001 = zPtr[0].x - z2Ptr[0].x;
				y003 = zPtr[m].y - z2Ptr[m].y;
				y001 = zPtr[0].y - z2Ptr[0].y;
				x003 = zPtr[m].x - z2Ptr[m].x;
				zPtr[0].x = x000 + x002;
				zPtr[0].y = y000 + y002;
				x000 -= x002;
				y000 -= y002;
				x010 = x001 + y003;
				y010 = y001 - x003;
				x001 -= y003;
				y001 += x003;
				zPtr[m].x  = ePtr[1].x * x000 + ePtr[1].y * y000;
				zPtr[m].y  = ePtr[1].x * y000 - ePtr[1].y * x000;
				z2Ptr[0].x = ePtr[0].x * x010 + ePtr[0].y * y010;
				z2Ptr[0].y = ePtr[0].x * y010 - ePtr[0].y * x010;
				z2Ptr[m].x = ePtr[2].x * x001 + ePtr[2].y * y001;
				z2Ptr[m].y = ePtr[2].x * y001 - ePtr[2].y * x001;

				x100 = zPtr[1].x     + z2Ptr[1].x;
				x102 = zPtr[m + 1].x + z2Ptr[m + 1].x;
				y100 = zPtr[1].y     + z2Ptr[1].y;
				y102 = zPtr[m + 1].y + z2Ptr[m + 1].y;
				x101 = zPtr[1].x     - z2Ptr[1].x;
				y103 = zPtr[m + 1].y - z2Ptr[m + 1].y;
				y101 = zPtr[1].y     - z2Ptr[1].y;
				x103 = zPtr[m + 1].x - z2Ptr[m + 1].x;
				zPtr[1].x = x100 + x102;
				zPtr[1].y = y100 + y102;
				x100 -= x102;
				y100 -= y102;
				x110 = x101 + y103;
				y110 = y101 - x103;
				x101 -= y103;
				y101 += x103;
				zPtr[m + 1].x  = ePtr[l + 1].x * x100 + ePtr[l + 1].y * y100;
				zPtr[m + 1].y  = ePtr[l + 1].x * y100 - ePtr[l + 1].y * x100;
				z2Ptr[1].x     = ePtr[l + 0].x * x110 + ePtr[l + 0].y * y110;
				z2Ptr[1].y     = ePtr[l + 0].x * y110 - ePtr[l + 0].y * x110;
				z2Ptr[m + 1].x = ePtr[l + 2].x * x101 + ePtr[l + 2].y * y101;
				z2Ptr[m + 1].y = ePtr[l + 2].x * y101 - ePtr[l + 2].y * x101;

				ePtr += 2*l;
				zPtr += 2; z2Ptr += 2;
			}
			zPtr += 3*m; z2Ptr += 3*m;
		}
	}

	if (m == 2)
	{
		zPtr = &z[0];
		x010 = e0[0].x;
		for (i = n; i != 0; i -= 8)
		{
			x000 = zPtr[0].x - zPtr[4].x;
			y000 = zPtr[0].y - zPtr[4].y;
			zPtr[0].x += zPtr[4].x;
			zPtr[0].y += zPtr[4].y;
			zPtr[4].x = x000;
			zPtr[4].y = y000;
			x001 = x010 * (zPtr[1].x - zPtr[5].x);
			y001 = x010 * (zPtr[1].y - zPtr[5].y);
			zPtr[1].x += zPtr[5].x;
			zPtr[1].y += zPtr[5].y;
			zPtr[5].x = x001 + y001;
			zPtr[5].y = y001 - x001;
			x002 = zPtr[6].x - zPtr[2].x;
			y002 = zPtr[2].y - zPtr[6].y;
			zPtr[2].x += zPtr[6].x;
			zPtr[2].y += zPtr[6].y;
			zPtr[6].x = y002;
			zPtr[6].y = x002;
			x003 = x010 * (zPtr[7].x - zPtr[3].x);
			y003 = x010 * (zPtr[7].y - zPtr[3].y);
			zPtr[3].x += zPtr[7].x;
			zPtr[3].y += zPtr[7].y;
			zPtr[7].x = x003 - y003;
			zPtr[7].y = y003 + x003;

			zPtr += 8;
		}
		l *= 2;
	}

	zPtr = &z[0];
	for (i = n; i != 0; i -= 8)
	{
		x000 = zPtr[0].x + zPtr[2].x;
		x002 = zPtr[1].x + zPtr[3].x;
		y000 = zPtr[0].y + zPtr[2].y;
		y002 = zPtr[1].y + zPtr[3].y;
		x001 = zPtr[0].x - zPtr[2].x;
		y003 = zPtr[1].y - zPtr[3].y;
		y001 = zPtr[0].y - zPtr[2].y;
		x003 = zPtr[1].x - zPtr[3].x;
		zPtr[0].x = x000 + x002;
		zPtr[0].y = y000 + y002;
		zPtr[1].x = x000 - x002;
		zPtr[1].y = y000 - y002;
		zPtr[2].x = x001 + y003;
		zPtr[2].y = y001 - x003;
		zPtr[3].x = x001 - y003;
		zPtr[3].y = y001 + x003;

		x100 = zPtr[4].x + zPtr[6].x;
		x102 = zPtr[5].x + zPtr[7].x;
		y100 = zPtr[4].y + zPtr[6].y;
		y102 = zPtr[5].y + zPtr[7].y;
		x101 = zPtr[4].x - zPtr[6].x;
		y103 = zPtr[5].y - zPtr[7].y;
		y101 = zPtr[4].y - zPtr[6].y;
		x103 = zPtr[5].x - zPtr[7].x;
		zPtr[4].x = x100 + x102;
		zPtr[4].y = y100 + y102;
		zPtr[5].x = x100 - x102;
		zPtr[5].y = y100 - y102;
		zPtr[6].x = x101 + y103;
		zPtr[6].y = y101 - x103;
		zPtr[7].x = x101 - y103;
		zPtr[7].y = y101 + x103;

		zPtr += 8;
	}

	zPtr = &z[0];
	for (i = n; i != 0; i -= 8)
	{
		x000 = zPtr[0].x + zPtr[0].y;
		y000 = zPtr[0].x - zPtr[0].y;
		x001 = zPtr[1].x + zPtr[1].y;
		y001 = zPtr[1].x - zPtr[1].y;
		x002 = zPtr[2].x + zPtr[2].y;
		y002 = zPtr[2].x - zPtr[2].y;
		x003 = zPtr[3].x + zPtr[3].y;
		y003 = zPtr[3].x - zPtr[3].y;
		zPtr[0].y *= -2 * zPtr[0].x;
		zPtr[0].x = x000 * y000;
		zPtr[1].y *=  2 * zPtr[1].x;
		zPtr[1].x = x001 * y001;
		zPtr[2].y *=  2 * zPtr[2].x;
		zPtr[2].x = x002 * y002;
		zPtr[3].y *=  2 * zPtr[3].x;
		zPtr[3].x = x003 * y003;

		x100 = zPtr[4].x + zPtr[4].y;
		y100 = zPtr[4].x - zPtr[4].y;
		x101 = zPtr[5].x + zPtr[5].y;
		y101 = zPtr[5].x - zPtr[5].y;
		x102 = zPtr[6].x + zPtr[6].y;
		y102 = zPtr[6].x - zPtr[6].y;
		x103 = zPtr[7].x + zPtr[7].y;
		y103 = zPtr[7].x - zPtr[7].y;
		zPtr[4].y *= -2 * zPtr[4].x;
		zPtr[4].x = x100 * y100;
		zPtr[5].y *=  2 * zPtr[5].x;
		zPtr[5].x = x101 * y101;
		zPtr[6].y *=  2 * zPtr[6].x;
		zPtr[6].x = x102 * y102;
		zPtr[7].y *=  2 * zPtr[7].x;
		zPtr[7].x = x103 * y103;

		zPtr += 8;
	}

	zPtr = &z[0];
	for (i = n; i != 0; i -= 8)
	{
		x000 = zPtr[0].x + zPtr[1].x;
		x001 = zPtr[0].x - zPtr[1].x;
		y000 = zPtr[0].y - zPtr[1].y;
		y001 = zPtr[0].y + zPtr[1].y;
		x002 = zPtr[2].x + zPtr[3].x;
		x003 = zPtr[2].x - zPtr[3].x;
		y002 = zPtr[2].y + zPtr[3].y;
		y003 = zPtr[2].y - zPtr[3].y;
		zPtr[0].x = x000 + x002;
		zPtr[2].x = x000 - x002;
		zPtr[1].y = y001 - x003;
		zPtr[3].y = y001 + x003;
		zPtr[0].y = y000 - y002;
		zPtr[2].y = y000 + y002;
		zPtr[1].x = x001 - y003;
		zPtr[3].x = x001 + y003;

		x100 = zPtr[4].x + zPtr[5].x;
		x101 = zPtr[4].x - zPtr[5].x;
		y100 = zPtr[4].y - zPtr[5].y;
		y101 = zPtr[4].y + zPtr[5].y;
		x102 = zPtr[6].x + zPtr[7].x;
		x103 = zPtr[6].x - zPtr[7].x;
		y102 = zPtr[6].y + zPtr[7].y;
		y103 = zPtr[6].y - zPtr[7].y;
		zPtr[4].x = x100 + x102;
		zPtr[6].x = x100 - x102;
		zPtr[5].y = y101 - x103;
		zPtr[7].y = y101 + x103;
		zPtr[4].y = y100 - y102;
		zPtr[6].y = y100 + y102;
		zPtr[5].x = x101 - y103;
		zPtr[7].x = x101 + y103;

		zPtr += 8;
	}

	if (m == 2)
	{
		l /= 2;
		zPtr = &z[0];
		x010 = e0[0].x;
		for (i = n; i != 0; i -= 8)
		{
			x000 = zPtr[0].x - zPtr[4].x;
			y000 = zPtr[0].y - zPtr[4].y;
			zPtr[0].x += zPtr[4].x;
			zPtr[0].y += zPtr[4].y;
			zPtr[4].x = x000;
			zPtr[4].y = y000;
			x001 = x010 * (zPtr[5].x + zPtr[5].y);
			y001 = x010 * (zPtr[5].y - zPtr[5].x);
			zPtr[5].x = zPtr[1].x - x001;
			zPtr[5].y = zPtr[1].y - y001;
			zPtr[1].x += x001;
			zPtr[1].y += y001;
			x002 = zPtr[2].x - zPtr[6].y;
			y002 = zPtr[2].y + zPtr[6].x;
			zPtr[2].x += zPtr[6].y;
			zPtr[2].y -= zPtr[6].x;
			zPtr[6].x = x002;
			zPtr[6].y = y002;
			x003 = x010 * (zPtr[7].x - zPtr[7].y);
			y003 = x010 * (zPtr[7].y + zPtr[7].x);
			zPtr[7].x = zPtr[3].x + x003;
			zPtr[7].y = zPtr[3].y + y003;
			zPtr[3].x -= x003;
			zPtr[3].y -= y003;

			zPtr += 8;
		}
	}

	m *= 4, l /= 4;

	for (; m != n; m *= 4, l /= 4)
	{
		zPtr  = &z[0];
		z2Ptr = &z[2*m];
		for (i = l; i != 0; i -= 3)
		{
			x000 = zPtr[0].x  + zPtr[m].x;
			x002 = z2Ptr[0].x + z2Ptr[m].x;
			y000 = zPtr[0].y  + zPtr[m].y;
			y002 = z2Ptr[0].y + z2Ptr[m].y;
			x001 = zPtr[0].x  - zPtr[m].x;
			y003 = z2Ptr[0].y - z2Ptr[m].y;
			y001 = zPtr[0].y  - zPtr[m].y;
			x003 = z2Ptr[0].x - z2Ptr[m].x;
			zPtr[0].x  = x000 + x002;
			zPtr[0].y  = y000 + y002;
			z2Ptr[0].x = x000 - x002;
			z2Ptr[0].y = y000 - y002;
			zPtr[m].x  = x001 + y003;
			zPtr[m].y  = y001 - x003;
			z2Ptr[m].x = x001 - y003;
			z2Ptr[m].y = y001 + x003;

			ePtr = &e0[l];

			x101 = ePtr[1].x * zPtr[m + 1].x  + ePtr[1].y * zPtr[m + 1].y;
			y101 = ePtr[1].x * zPtr[m + 1].y  - ePtr[1].y * zPtr[m + 1].x;
			x102 = ePtr[0].x * z2Ptr[1].x     + ePtr[0].y * z2Ptr[1].y;
			y102 = ePtr[0].x * z2Ptr[1].y     - ePtr[0].y * z2Ptr[1].x;
			x103 = ePtr[2].x * z2Ptr[m + 1].x + ePtr[2].y * z2Ptr[m + 1].y;
			y103 = ePtr[2].x * z2Ptr[m + 1].y - ePtr[2].y * z2Ptr[m + 1].x;
			x100 = zPtr[1].x + x101;
			x101 = zPtr[1].x - x101;
			y100 = zPtr[1].y + y101;
			y101 = zPtr[1].y - y101;
			x110 = x102 + x103;
			x102 -= x103;
			y110 = y102 + y103;
			y102 -= y103;
			zPtr[1].x      = x100 + x110;
			z2Ptr[1].x     = x100 - x110;
			zPtr[1].y      = y100 + y110;
			z2Ptr[1].y     = y100 - y110;
			zPtr[m + 1].x  = x101 + y102;
			z2Ptr[m + 1].x = x101 - y102;
			zPtr[m + 1].y  = y101 - x102;
			z2Ptr[m + 1].y = y101 + x102;

			ePtr += l;
			zPtr += 2; z2Ptr += 2;

			for (j = m - 2; j != 0; j -= 2)
			{
				x001 = ePtr[1].x * zPtr[m].x  + ePtr[1].y * zPtr[m].y;
				y001 = ePtr[1].x * zPtr[m].y  - ePtr[1].y * zPtr[m].x;
				x002 = ePtr[0].x * z2Ptr[0].x + ePtr[0].y * z2Ptr[0].y;
				y002 = ePtr[0].x * z2Ptr[0].y - ePtr[0].y * z2Ptr[0].x;
				x003 = ePtr[2].x * z2Ptr[m].x + ePtr[2].y * z2Ptr[m].y;
				y003 = ePtr[2].x * z2Ptr[m].y - ePtr[2].y * z2Ptr[m].x;
				x000 = zPtr[0].x + x001;
				x001 = zPtr[0].x - x001;
				y000 = zPtr[0].y + y001;
				y001 = zPtr[0].y - y001;
				x010 = x002 + x003;
				x002 -= x003;
				y010 = y002 + y003;
				y002 -= y003;
				zPtr[0].x  = x000 + x010;
				z2Ptr[0].x = x000 - x010;
				zPtr[0].y  = y000 + y010;
				z2Ptr[0].y = y000 - y010;
				zPtr[m].x  = x001 + y002;
				z2Ptr[m].x = x001 - y002;
				zPtr[m].y  = y001 - x002;
				z2Ptr[m].y = y001 + x002;

				x101 = ePtr[l + 1].x * zPtr[m + 1].x  + ePtr[l + 1].y * zPtr[m + 1].y;
				y101 = ePtr[l + 1].x * zPtr[m + 1].y  - ePtr[l + 1].y * zPtr[m + 1].x;
				x102 = ePtr[l + 0].x * z2Ptr[1].x     + ePtr[l + 0].y * z2Ptr[1].y;
				y102 = ePtr[l + 0].x * z2Ptr[1].y     - ePtr[l + 0].y * z2Ptr[1].x;
				x103 = ePtr[l + 2].x * z2Ptr[m + 1].x + ePtr[l + 2].y * z2Ptr[m + 1].y;
				y103 = ePtr[l + 2].x * z2Ptr[m + 1].y - ePtr[l + 2].y * z2Ptr[m + 1].x;
				x100 = zPtr[1].x + x101;
				x101 = zPtr[1].x - x101;
				y100 = zPtr[1].y + y101;
				y101 = zPtr[1].y - y101;
				x110 = x102 + x103;
				x102 -= x103;
				y110 = y102 + y103;
				y102 -= y103;
				zPtr[1].x      = x100 + x110;
				z2Ptr[1].x     = x100 - x110;
				zPtr[1].y      = y100 + y110;
				z2Ptr[1].y     = y100 - y110;
				zPtr[m + 1].x  = x101 + y102;
				z2Ptr[m + 1].x = x101 - y102;
				zPtr[m + 1].y  = y101 - x102;
				z2Ptr[m + 1].y = y101 + x102;

				ePtr += 2*l;
				zPtr += 2; z2Ptr += 2;
			}
			zPtr += 3*m; z2Ptr += 3*m;
		}
	}

	if (nsc != 0)
		mulExp(z, n, nsc);
}

static void FFTSinit(CONST unsigned int n)
{
	unsigned int i;
	CONST double twoPi_n = 2 * M_PI / n;
	double theta;

	e2[0].x = 0.5 * sqrt(2.0);
	for (i = 1; i != n / 4; ++i)
	{
		theta = twoPi_n * i;
		e2[3*i + 0].x = cos(1*theta);
		e2[3*i + 0].y = sin(1*theta);
		e2[3*i + 1].x = cos(2*theta);
		e2[3*i + 1].y = sin(2*theta);
		e2[3*i + 2].x = cos(3*theta);
		e2[3*i + 2].y = sin(3*theta);
	}
}

static void FFTStime(Complex * CONST z, CONST unsigned int n, CONST unsigned int s)
{
	unsigned int i, j, m, l;
	Complex * zPtr, * z2Ptr;
	CONST Complex * ePtr;
	double x000, y000, x001, y001, x002, y002, x003, y003, x010, y010;
	double x100, y100, x101, y101, x102, y102, x103, y103, x110, y110;
	double x200, y200, x201, y201, x202, y202, x203, y203, x210, y210;
	double x300, y300, x301, y301, x302, y302, x303, y303, x310, y310;

	m = s, l = 3 * n / 4;

	zPtr  = &z[0];
	z2Ptr = &z[2*s];
	for (i = n; i != 0; i -= 4)
	{
		x000 = zPtr[0].x  + zPtr[s].x;
		x002 = z2Ptr[0].x + z2Ptr[s].x;
		y000 = zPtr[0].y  + zPtr[s].y;
		y002 = z2Ptr[0].y + z2Ptr[s].y;
		x001 = zPtr[0].x  - zPtr[s].x;
		y003 = z2Ptr[0].y - z2Ptr[s].y;
		y001 = zPtr[0].y  - zPtr[s].y;
		x003 = z2Ptr[0].x - z2Ptr[s].x;
		zPtr[0].x  = x000 + x002;
		zPtr[0].y  = y000 + y002;
		z2Ptr[0].x = x000 - x002;
		z2Ptr[0].y = y000 - y002;
		zPtr[s].x  = x001 + y003;
		zPtr[s].y  = y001 - x003;
		z2Ptr[s].x = x001 - y003;
		z2Ptr[s].y = y001 + x003;

		x100 = zPtr[1].x  + zPtr[s + 1].x;
		x102 = z2Ptr[1].x + z2Ptr[s + 1].x;
		y100 = zPtr[1].y  + zPtr[s + 1].y;
		y102 = z2Ptr[1].y + z2Ptr[s + 1].y;
		x101 = zPtr[1].x  - zPtr[s + 1].x;
		y103 = z2Ptr[1].y - z2Ptr[s + 1].y;
		y101 = zPtr[1].y  - zPtr[s + 1].y;
		x103 = z2Ptr[1].x - z2Ptr[s + 1].x;
		zPtr[1].x  = x100 + x102;
		zPtr[1].y  = y100 + y102;
		z2Ptr[1].x = x100 - x102;
		z2Ptr[1].y = y100 - y102;
		zPtr[s + 1].x  = x101 + y103;
		zPtr[s + 1].y  = y101 - x103;
		z2Ptr[s + 1].x = x101 - y103;
		z2Ptr[s + 1].y = y101 + x103;

		x200 = zPtr[2].x  + zPtr[s + 2].x;
		x202 = z2Ptr[2].x + z2Ptr[s + 2].x;
		y200 = zPtr[2].y  + zPtr[s + 2].y;
		y202 = z2Ptr[2].y + z2Ptr[s + 2].y;
		x201 = zPtr[2].x  - zPtr[s + 2].x;
		y203 = z2Ptr[2].y - z2Ptr[s + 2].y;
		y201 = zPtr[2].y  - zPtr[s + 2].y;
		x203 = z2Ptr[2].x - z2Ptr[s + 2].x;
		zPtr[2].x  = x200 + x202;
		zPtr[2].y  = y200 + y202;
		z2Ptr[2].x = x200 - x202;
		z2Ptr[2].y = y200 - y202;
		zPtr[s + 2].x  = x201 + y203;
		zPtr[s + 2].y  = y201 - x203;
		z2Ptr[s + 2].x = x201 - y203;
		z2Ptr[s + 2].y = y201 + x203;

		x300 = zPtr[3].x  + zPtr[s + 3].x;
		x302 = z2Ptr[3].x + z2Ptr[s + 3].x;
		y300 = zPtr[3].y  + zPtr[s + 3].y;
		y302 = z2Ptr[3].y + z2Ptr[s + 3].y;
		x301 = zPtr[3].x  - zPtr[s + 3].x;
		y303 = z2Ptr[3].y - z2Ptr[s + 3].y;
		y301 = zPtr[3].y  - zPtr[s + 3].y;
		x303 = z2Ptr[3].x - z2Ptr[s + 3].x;
		zPtr[3].x  = x300 + x302;
		zPtr[3].y  = y300 + y302;
		z2Ptr[3].x = x300 - x302;
		z2Ptr[3].y = y300 - y302;
		zPtr[s + 3].x  = x301 + y303;
		zPtr[s + 3].y  = y301 - x303;
		z2Ptr[s + 3].x = x301 - y303;
		z2Ptr[s + 3].y = y301 + x303;

		zPtr += 4*s; z2Ptr += 4*s;
	}

	if ((ilog(n) & 1) != 0)
	{
		m *= 2, l /= 2;
		zPtr  = &z[0];
		z2Ptr = &z[4*s];
		x010  = e2[0].x;
		for (i = n; i != 0; i -= 8)
		{
			x000 = zPtr[0].x - z2Ptr[0].x;
			y000 = zPtr[0].y - z2Ptr[0].y;
			zPtr[0].x += z2Ptr[0].x;
			zPtr[0].y += z2Ptr[0].y;
			z2Ptr[0].x = x000;
			z2Ptr[0].y = y000;
			x001 = x010 * (z2Ptr[s].x + z2Ptr[s].y);
			y001 = x010 * (z2Ptr[s].y - z2Ptr[s].x);
			z2Ptr[s].x = zPtr[s].x - x001;
			z2Ptr[s].y = zPtr[s].y - y001;
			zPtr[s].x += x001;
			zPtr[s].y += y001;
			x002 = zPtr[2*s].x - z2Ptr[2*s].y;
			y002 = zPtr[2*s].y + z2Ptr[2*s].x;
			zPtr[2*s].x += z2Ptr[2*s].y;
			zPtr[2*s].y -= z2Ptr[2*s].x;
			z2Ptr[2*s].x = x002;
			z2Ptr[2*s].y = y002;
			x003 = x010 * (z2Ptr[3*s].x - z2Ptr[3*s].y);
			y003 = x010 * (z2Ptr[3*s].y + z2Ptr[3*s].x);
			z2Ptr[3*s].x = zPtr[3*s].x + x003;
			z2Ptr[3*s].y = zPtr[3*s].y + y003;
			zPtr[3*s].x -= x003;
			zPtr[3*s].y -= y003;

			x100 = zPtr[1].x - z2Ptr[1].x;
			y100 = zPtr[1].y - z2Ptr[1].y;
			zPtr[1].x += z2Ptr[1].x;
			zPtr[1].y += z2Ptr[1].y;
			z2Ptr[1].x = x100;
			z2Ptr[1].y = y100;
			x101 = x010 * (z2Ptr[s + 1].x + z2Ptr[s + 1].y);
			y101 = x010 * (z2Ptr[s + 1].y - z2Ptr[s + 1].x);
			z2Ptr[s + 1].x = zPtr[s + 1].x - x101;
			z2Ptr[s + 1].y = zPtr[s + 1].y - y101;
			zPtr[s + 1].x += x101;
			zPtr[s + 1].y += y101;
			x102 = zPtr[2*s + 1].x - z2Ptr[2*s + 1].y;
			y102 = zPtr[2*s + 1].y + z2Ptr[2*s + 1].x;
			zPtr[2*s + 1].x += z2Ptr[2*s + 1].y;
			zPtr[2*s + 1].y -= z2Ptr[2*s + 1].x;
			z2Ptr[2*s + 1].x = x102;
			z2Ptr[2*s + 1].y = y102;
			x103 = x010 * (z2Ptr[3*s + 1].x - z2Ptr[3*s + 1].y);
			y103 = x010 * (z2Ptr[3*s + 1].y + z2Ptr[3*s + 1].x);
			z2Ptr[3*s + 1].x = zPtr[3*s + 1].x + x103;
			z2Ptr[3*s + 1].y = zPtr[3*s + 1].y + y103;
			zPtr[3*s + 1].x -= x103;
			zPtr[3*s + 1].y -= y103;

			x200 = zPtr[2].x - z2Ptr[2].x;
			y200 = zPtr[2].y - z2Ptr[2].y;
			zPtr[2].x += z2Ptr[2].x;
			zPtr[2].y += z2Ptr[2].y;
			z2Ptr[2].x = x200;
			z2Ptr[2].y = y200;
			x201 = x010 * (z2Ptr[s + 2].x + z2Ptr[s + 2].y);
			y201 = x010 * (z2Ptr[s + 2].y - z2Ptr[s + 2].x);
			z2Ptr[s + 2].x = zPtr[s + 2].x - x201;
			z2Ptr[s + 2].y = zPtr[s + 2].y - y201;
			zPtr[s + 2].x += x201;
			zPtr[s + 2].y += y201;
			x202 = zPtr[2*s + 2].x - z2Ptr[2*s + 2].y;
			y202 = zPtr[2*s + 2].y + z2Ptr[2*s + 2].x;
			zPtr[2*s + 2].x += z2Ptr[2*s + 2].y;
			zPtr[2*s + 2].y -= z2Ptr[2*s + 2].x;
			z2Ptr[2*s + 2].x = x202;
			z2Ptr[2*s + 2].y = y202;
			x203 = x010 * (z2Ptr[3*s + 2].x - z2Ptr[3*s + 2].y);
			y203 = x010 * (z2Ptr[3*s + 2].y + z2Ptr[3*s + 2].x);
			z2Ptr[3*s + 2].x = zPtr[3*s + 2].x + x203;
			z2Ptr[3*s + 2].y = zPtr[3*s + 2].y + y203;
			zPtr[3*s + 2].x -= x203;
			zPtr[3*s + 2].y -= y203;

			x300 = zPtr[3].x - z2Ptr[3].x;
			y300 = zPtr[3].y - z2Ptr[3].y;
			zPtr[3].x += z2Ptr[3].x;
			zPtr[3].y += z2Ptr[3].y;
			z2Ptr[3].x = x300;
			z2Ptr[3].y = y300;
			x301 = x010 * (z2Ptr[s + 3].x + z2Ptr[s + 3].y);
			y301 = x010 * (z2Ptr[s + 3].y - z2Ptr[s + 3].x);
			z2Ptr[s + 3].x = zPtr[s + 3].x - x301;
			z2Ptr[s + 3].y = zPtr[s + 3].y - y301;
			zPtr[s + 3].x += x301;
			zPtr[s + 3].y += y301;
			x302 = zPtr[2*s + 3].x - z2Ptr[2*s + 3].y;
			y302 = zPtr[2*s + 3].y + z2Ptr[2*s + 3].x;
			zPtr[2*s + 3].x += z2Ptr[2*s + 3].y;
			zPtr[2*s + 3].y -= z2Ptr[2*s + 3].x;
			z2Ptr[2*s + 3].x = x302;
			z2Ptr[2*s + 3].y = y302;
			x303 = x010 * (z2Ptr[3*s + 3].x - z2Ptr[3*s + 3].y);
			y303 = x010 * (z2Ptr[3*s + 3].y + z2Ptr[3*s + 3].x);
			z2Ptr[3*s + 3].x = zPtr[3*s + 3].x + x303;
			z2Ptr[3*s + 3].y = zPtr[3*s + 3].y + y303;
			zPtr[3*s + 3].x -= x303;
			zPtr[3*s + 3].y -= y303;

			zPtr += 8*s; z2Ptr += 8*s;
		}
	}

	m *= 4, l /= 4;

	for (; l != 0; m *= 4, l /= 4)
	{
		zPtr  = &z[0];
		z2Ptr = &z[2*m];
		for (i = l; i != 0; i -= 3)
		{
			x000 = zPtr[0].x  + zPtr[m].x;
			x002 = z2Ptr[0].x + z2Ptr[m].x;
			y000 = zPtr[0].y  + zPtr[m].y;
			y002 = z2Ptr[0].y + z2Ptr[m].y;
			x001 = zPtr[0].x  - zPtr[m].x;
			y003 = z2Ptr[0].y - z2Ptr[m].y;
			y001 = zPtr[0].y  - zPtr[m].y;
			x003 = z2Ptr[0].x - z2Ptr[m].x;
			zPtr[0].x  = x000 + x002;
			zPtr[0].y  = y000 + y002;
			z2Ptr[0].x = x000 - x002;
			z2Ptr[0].y = y000 - y002;
			zPtr[m].x  = x001 + y003;
			zPtr[m].y  = y001 - x003;
			z2Ptr[m].x = x001 - y003;
			z2Ptr[m].y = y001 + x003;

			x100 = zPtr[1].x  + zPtr[m + 1].x;
			x102 = z2Ptr[1].x + z2Ptr[m + 1].x;
			y100 = zPtr[1].y  + zPtr[m + 1].y;
			y102 = z2Ptr[1].y + z2Ptr[m + 1].y;
			x101 = zPtr[1].x  - zPtr[m + 1].x;
			y103 = z2Ptr[1].y - z2Ptr[m + 1].y;
			y101 = zPtr[1].y  - zPtr[m + 1].y;
			x103 = z2Ptr[1].x - z2Ptr[m + 1].x;
			zPtr[1].x  = x100 + x102;
			zPtr[1].y  = y100 + y102;
			z2Ptr[1].x = x100 - x102;
			z2Ptr[1].y = y100 - y102;
			zPtr[m + 1].x  = x101 + y103;
			zPtr[m + 1].y  = y101 - x103;
			z2Ptr[m + 1].x = x101 - y103;
			z2Ptr[m + 1].y = y101 + x103;

			x200 = zPtr[2].x  + zPtr[m + 2].x;
			x202 = z2Ptr[2].x + z2Ptr[m + 2].x;
			y200 = zPtr[2].y  + zPtr[m + 2].y;
			y202 = z2Ptr[2].y + z2Ptr[m + 2].y;
			x201 = zPtr[2].x  - zPtr[m + 2].x;
			y203 = z2Ptr[2].y - z2Ptr[m + 2].y;
			y201 = zPtr[2].y  - zPtr[m + 2].y;
			x203 = z2Ptr[2].x - z2Ptr[m + 2].x;
			zPtr[2].x  = x200 + x202;
			zPtr[2].y  = y200 + y202;
			z2Ptr[2].x = x200 - x202;
			z2Ptr[2].y = y200 - y202;
			zPtr[m + 2].x  = x201 + y203;
			zPtr[m + 2].y  = y201 - x203;
			z2Ptr[m + 2].x = x201 - y203;
			z2Ptr[m + 2].y = y201 + x203;

			x300 = zPtr[3].x  + zPtr[m + 3].x;
			x302 = z2Ptr[3].x + z2Ptr[m + 3].x;
			y300 = zPtr[3].y  + zPtr[m + 3].y;
			y302 = z2Ptr[3].y + z2Ptr[m + 3].y;
			x301 = zPtr[3].x  - zPtr[m + 3].x;
			y303 = z2Ptr[3].y - z2Ptr[m + 3].y;
			y301 = zPtr[3].y  - zPtr[m + 3].y;
			x303 = z2Ptr[3].x - z2Ptr[m + 3].x;
			zPtr[3].x  = x300 + x302;
			zPtr[3].y  = y300 + y302;
			z2Ptr[3].x = x300 - x302;
			z2Ptr[3].y = y300 - y302;
			zPtr[m + 3].x  = x301 + y303;
			zPtr[m + 3].y  = y301 - x303;
			z2Ptr[m + 3].x = x301 - y303;
			z2Ptr[m + 3].y = y301 + x303;

			ePtr = &e2[l];
			zPtr += s; z2Ptr += s;

			for (j = m - s; j != 0; j -= s)
			{
				x001 = ePtr[1].x * zPtr[m].x  + ePtr[1].y * zPtr[m].y;
				y001 = ePtr[1].x * zPtr[m].y  - ePtr[1].y * zPtr[m].x;
				x002 = ePtr[0].x * z2Ptr[0].x + ePtr[0].y * z2Ptr[0].y;
				y002 = ePtr[0].x * z2Ptr[0].y - ePtr[0].y * z2Ptr[0].x;
				x003 = ePtr[2].x * z2Ptr[m].x + ePtr[2].y * z2Ptr[m].y;
				y003 = ePtr[2].x * z2Ptr[m].y - ePtr[2].y * z2Ptr[m].x;
				x000 = zPtr[0].x + x001;
				x001 = zPtr[0].x - x001;
				y000 = zPtr[0].y + y001;
				y001 = zPtr[0].y - y001;
				x010 = x002 + x003;
				x002 -= x003;
				y010 = y002 + y003;
				y002 -= y003;
				zPtr[0].x  = x000 + x010;
				z2Ptr[0].x = x000 - x010;
				zPtr[0].y  = y000 + y010;
				z2Ptr[0].y = y000 - y010;
				zPtr[m].x  = x001 + y002;
				z2Ptr[m].x = x001 - y002;
				zPtr[m].y  = y001 - x002;
				z2Ptr[m].y = y001 + x002;

				x101 = ePtr[1].x * zPtr[m + 1].x  + ePtr[1].y * zPtr[m + 1].y;
				y101 = ePtr[1].x * zPtr[m + 1].y  - ePtr[1].y * zPtr[m + 1].x;
				x102 = ePtr[0].x * z2Ptr[1].x + ePtr[0].y * z2Ptr[1].y;
				y102 = ePtr[0].x * z2Ptr[1].y - ePtr[0].y * z2Ptr[1].x;
				x103 = ePtr[2].x * z2Ptr[m + 1].x + ePtr[2].y * z2Ptr[m + 1].y;
				y103 = ePtr[2].x * z2Ptr[m + 1].y - ePtr[2].y * z2Ptr[m + 1].x;
				x100 = zPtr[1].x + x101;
				x101 = zPtr[1].x - x101;
				y100 = zPtr[1].y + y101;
				y101 = zPtr[1].y - y101;
				x110 = x102 + x103;
				x102 -= x103;
				y110 = y102 + y103;
				y102 -= y103;
				zPtr[1].x  = x100 + x110;
				z2Ptr[1].x = x100 - x110;
				zPtr[1].y  = y100 + y110;
				z2Ptr[1].y = y100 - y110;
				zPtr[m + 1].x  = x101 + y102;
				z2Ptr[m + 1].x = x101 - y102;
				zPtr[m + 1].y  = y101 - x102;
				z2Ptr[m + 1].y = y101 + x102;

				x201 = ePtr[1].x * zPtr[m + 2].x  + ePtr[1].y * zPtr[m + 2].y;
				y201 = ePtr[1].x * zPtr[m + 2].y  - ePtr[1].y * zPtr[m + 2].x;
				x202 = ePtr[0].x * z2Ptr[2].x + ePtr[0].y * z2Ptr[2].y;
				y202 = ePtr[0].x * z2Ptr[2].y - ePtr[0].y * z2Ptr[2].x;
				x203 = ePtr[2].x * z2Ptr[m + 2].x + ePtr[2].y * z2Ptr[m + 2].y;
				y203 = ePtr[2].x * z2Ptr[m + 2].y - ePtr[2].y * z2Ptr[m + 2].x;
				x200 = zPtr[2].x + x201;
				x201 = zPtr[2].x - x201;
				y200 = zPtr[2].y + y201;
				y201 = zPtr[2].y - y201;
				x210 = x202 + x203;
				x202 -= x203;
				y210 = y202 + y203;
				y202 -= y203;
				zPtr[2].x  = x200 + x210;
				z2Ptr[2].x = x200 - x210;
				zPtr[2].y  = y200 + y210;
				z2Ptr[2].y = y200 - y210;
				zPtr[m + 2].x  = x201 + y202;
				z2Ptr[m + 2].x = x201 - y202;
				zPtr[m + 2].y  = y201 - x202;
				z2Ptr[m + 2].y = y201 + x202;

				x301 = ePtr[1].x * zPtr[m + 3].x  + ePtr[1].y * zPtr[m + 3].y;
				y301 = ePtr[1].x * zPtr[m + 3].y  - ePtr[1].y * zPtr[m + 3].x;
				x302 = ePtr[0].x * z2Ptr[3].x + ePtr[0].y * z2Ptr[3].y;
				y302 = ePtr[0].x * z2Ptr[3].y - ePtr[0].y * z2Ptr[3].x;
				x303 = ePtr[2].x * z2Ptr[m + 3].x + ePtr[2].y * z2Ptr[m + 3].y;
				y303 = ePtr[2].x * z2Ptr[m + 3].y - ePtr[2].y * z2Ptr[m + 3].x;
				x300 = zPtr[3].x + x301;
				x301 = zPtr[3].x - x301;
				y300 = zPtr[3].y + y301;
				y301 = zPtr[3].y - y301;
				x310 = x302 + x303;
				x302 -= x303;
				y310 = y302 + y303;
				y302 -= y303;
				zPtr[3].x  = x300 + x310;
				z2Ptr[3].x = x300 - x310;
				zPtr[3].y  = y300 + y310;
				z2Ptr[3].y = y300 - y310;
				zPtr[m + 3].x  = x301 + y302;
				z2Ptr[m + 3].x = x301 - y302;
				zPtr[m + 3].y  = y301 - x302;
				z2Ptr[m + 3].y = y301 + x302;

				ePtr += l;
				zPtr += s; z2Ptr += s;
			}
			zPtr += 3*m; z2Ptr += 3*m;
		}
	}
}

static void FFTSfreq(Complex * CONST z, CONST unsigned int n, CONST unsigned int s)
{
	unsigned int i, j, m, l;
	Complex * zPtr, * z2Ptr;
	CONST Complex * ePtr;
	double x000, y000, x001, y001, x002, y002, x003, y003, x010, y010;
	double x100, y100, x101, y101, x102, y102, x103, y103, x110, y110;
	double x200, y200, x201, y201, x202, y202, x203, y203, x210, y210;
	double x300, y300, x301, y301, x302, y302, x303, y303, x310, y310;

	m = s * n / 4, l = 3;

	for (; m > 2*s; m /= 4, l *= 4)
	{
		zPtr  = &z[0];
		z2Ptr = &z[2*m];
		for (i = l; i != 0; i -= 3)
		{
			x000 = zPtr[0].x + z2Ptr[0].x;
			x002 = zPtr[m].x + z2Ptr[m].x;
			y000 = zPtr[0].y + z2Ptr[0].y;
			y002 = zPtr[m].y + z2Ptr[m].y;
			x001 = zPtr[0].x - z2Ptr[0].x;
			y003 = zPtr[m].y - z2Ptr[m].y;
			y001 = zPtr[0].y - z2Ptr[0].y;
			x003 = zPtr[m].x - z2Ptr[m].x;
			zPtr[0].x  = x000 + x002;
			zPtr[0].y  = y000 + y002;
			zPtr[m].x  = x000 - x002;
			zPtr[m].y  = y000 - y002;
			z2Ptr[0].x = x001 + y003;
			z2Ptr[0].y = y001 - x003;
			z2Ptr[m].x = x001 - y003;
			z2Ptr[m].y = y001 + x003;

			x100 = zPtr[1].x + z2Ptr[1].x;
			x102 = zPtr[m + 1].x + z2Ptr[m + 1].x;
			y100 = zPtr[1].y + z2Ptr[1].y;
			y102 = zPtr[m + 1].y + z2Ptr[m + 1].y;
			x101 = zPtr[1].x - z2Ptr[1].x;
			y103 = zPtr[m + 1].y - z2Ptr[m + 1].y;
			y101 = zPtr[1].y - z2Ptr[1].y;
			x103 = zPtr[m + 1].x - z2Ptr[m + 1].x;
			zPtr[1].x      = x100 + x102;
			zPtr[1].y      = y100 + y102;
			zPtr[m + 1].x  = x100 - x102;
			zPtr[m + 1].y  = y100 - y102;
			z2Ptr[1].x     = x101 + y103;
			z2Ptr[1].y     = y101 - x103;
			z2Ptr[m + 1].x = x101 - y103;
			z2Ptr[m + 1].y = y101 + x103;

			x200 = zPtr[2].x + z2Ptr[2].x;
			x202 = zPtr[m + 2].x + z2Ptr[m + 2].x;
			y200 = zPtr[2].y + z2Ptr[2].y;
			y202 = zPtr[m + 2].y + z2Ptr[m + 2].y;
			x201 = zPtr[2].x - z2Ptr[2].x;
			y203 = zPtr[m + 2].y - z2Ptr[m + 2].y;
			y201 = zPtr[2].y - z2Ptr[2].y;
			x203 = zPtr[m + 2].x - z2Ptr[m + 2].x;
			zPtr[2].x      = x200 + x202;
			zPtr[2].y      = y200 + y202;
			zPtr[m + 2].x  = x200 - x202;
			zPtr[m + 2].y  = y200 - y202;
			z2Ptr[2].x     = x201 + y203;
			z2Ptr[2].y     = y201 - x203;
			z2Ptr[m + 2].x = x201 - y203;
			z2Ptr[m + 2].y = y201 + x203;

			x300 = zPtr[3].x + z2Ptr[3].x;
			x302 = zPtr[m + 3].x + z2Ptr[m + 3].x;
			y300 = zPtr[3].y + z2Ptr[3].y;
			y302 = zPtr[m + 3].y + z2Ptr[m + 3].y;
			x301 = zPtr[3].x - z2Ptr[3].x;
			y303 = zPtr[m + 3].y - z2Ptr[m + 3].y;
			y301 = zPtr[3].y - z2Ptr[3].y;
			x303 = zPtr[m + 3].x - z2Ptr[m + 3].x;
			zPtr[3].x      = x300 + x302;
			zPtr[3].y      = y300 + y302;
			zPtr[m + 3].x  = x300 - x302;
			zPtr[m + 3].y  = y300 - y302;
			z2Ptr[3].x     = x301 + y303;
			z2Ptr[3].y     = y301 - x303;
			z2Ptr[m + 3].x = x301 - y303;
			z2Ptr[m + 3].y = y301 + x303;

			ePtr = &e2[l];
			zPtr += s; z2Ptr += s;

			for (j = m - s; j != 0; j -= s)
			{
				x000 = zPtr[0].x + z2Ptr[0].x;
				x002 = zPtr[m].x + z2Ptr[m].x;
				y000 = zPtr[0].y + z2Ptr[0].y;
				y002 = zPtr[m].y + z2Ptr[m].y;
				x001 = zPtr[0].x - z2Ptr[0].x;
				y003 = zPtr[m].y - z2Ptr[m].y;
				y001 = zPtr[0].y - z2Ptr[0].y;
				x003 = zPtr[m].x - z2Ptr[m].x;
				zPtr[0].x = x000 + x002;
				zPtr[0].y = y000 + y002;
				x000 -= x002;
				y000 -= y002;
				x010 = x001 + y003;
				y010 = y001 - x003;
				x001 -= y003;
				y001 += x003;
				zPtr[m].x  = ePtr[1].x * x000 + ePtr[1].y * y000;
				zPtr[m].y  = ePtr[1].x * y000 - ePtr[1].y * x000;
				z2Ptr[0].x = ePtr[0].x * x010 + ePtr[0].y * y010;
				z2Ptr[0].y = ePtr[0].x * y010 - ePtr[0].y * x010;
				z2Ptr[m].x = ePtr[2].x * x001 + ePtr[2].y * y001;
				z2Ptr[m].y = ePtr[2].x * y001 - ePtr[2].y * x001;

				x100 = zPtr[1].x + z2Ptr[1].x;
				x102 = zPtr[m + 1].x + z2Ptr[m + 1].x;
				y100 = zPtr[1].y + z2Ptr[1].y;
				y102 = zPtr[m + 1].y + z2Ptr[m + 1].y;
				x101 = zPtr[1].x - z2Ptr[1].x;
				y103 = zPtr[m + 1].y - z2Ptr[m + 1].y;
				y101 = zPtr[1].y - z2Ptr[1].y;
				x103 = zPtr[m + 1].x - z2Ptr[m + 1].x;
				zPtr[1].x = x100 + x102;
				zPtr[1].y = y100 + y102;
				x100 -= x102;
				y100 -= y102;
				x110 = x101 + y103;
				y110 = y101 - x103;
				x101 -= y103;
				y101 += x103;
				zPtr[m + 1].x  = ePtr[1].x * x100 + ePtr[1].y * y100;
				zPtr[m + 1].y  = ePtr[1].x * y100 - ePtr[1].y * x100;
				z2Ptr[1].x     = ePtr[0].x * x110 + ePtr[0].y * y110;
				z2Ptr[1].y     = ePtr[0].x * y110 - ePtr[0].y * x110;
				z2Ptr[m + 1].x = ePtr[2].x * x101 + ePtr[2].y * y101;
				z2Ptr[m + 1].y = ePtr[2].x * y101 - ePtr[2].y * x101;

				x200 = zPtr[2].x + z2Ptr[2].x;
				x202 = zPtr[m + 2].x + z2Ptr[m + 2].x;
				y200 = zPtr[2].y + z2Ptr[2].y;
				y202 = zPtr[m + 2].y + z2Ptr[m + 2].y;
				x201 = zPtr[2].x - z2Ptr[2].x;
				y203 = zPtr[m + 2].y - z2Ptr[m + 2].y;
				y201 = zPtr[2].y - z2Ptr[2].y;
				x203 = zPtr[m + 2].x - z2Ptr[m + 2].x;
				zPtr[2].x = x200 + x202;
				zPtr[2].y = y200 + y202;
				x200 -= x202;
				y200 -= y202;
				x210 = x201 + y203;
				y210 = y201 - x203;
				x201 -= y203;
				y201 += x203;
				zPtr[m + 2].x  = ePtr[1].x * x200 + ePtr[1].y * y200;
				zPtr[m + 2].y  = ePtr[1].x * y200 - ePtr[1].y * x200;
				z2Ptr[2].x     = ePtr[0].x * x210 + ePtr[0].y * y210;
				z2Ptr[2].y     = ePtr[0].x * y210 - ePtr[0].y * x210;
				z2Ptr[m + 2].x = ePtr[2].x * x201 + ePtr[2].y * y201;
				z2Ptr[m + 2].y = ePtr[2].x * y201 - ePtr[2].y * x201;

				x300 = zPtr[3].x + z2Ptr[3].x;
				x302 = zPtr[m + 3].x + z2Ptr[m + 3].x;
				y300 = zPtr[3].y + z2Ptr[3].y;
				y302 = zPtr[m + 3].y + z2Ptr[m + 3].y;
				x301 = zPtr[3].x - z2Ptr[3].x;
				y303 = zPtr[m + 3].y - z2Ptr[m + 3].y;
				y301 = zPtr[3].y - z2Ptr[3].y;
				x303 = zPtr[m + 3].x - z2Ptr[m + 3].x;
				zPtr[3].x = x300 + x302;
				zPtr[3].y = y300 + y302;
				x300 -= x302;
				y300 -= y302;
				x310 = x301 + y303;
				y310 = y301 - x303;
				x301 -= y303;
				y301 += x303;
				zPtr[m + 3].x  = ePtr[1].x * x300 + ePtr[1].y * y300;
				zPtr[m + 3].y  = ePtr[1].x * y300 - ePtr[1].y * x300;
				z2Ptr[3].x     = ePtr[0].x * x310 + ePtr[0].y * y310;
				z2Ptr[3].y     = ePtr[0].x * y310 - ePtr[0].y * x310;
				z2Ptr[m + 3].x = ePtr[2].x * x301 + ePtr[2].y * y301;
				z2Ptr[m + 3].y = ePtr[2].x * y301 - ePtr[2].y * x301;

				ePtr += l;
				zPtr += s; z2Ptr += s;
			}
			zPtr += 3*m; z2Ptr += 3*m;
		}
	}

	if (m == 2*s)
	{
		zPtr  = &z[0];
		z2Ptr = &z[4*s];
		x010 = e0[0].x;
		for (i = n; i != 0; i -= 8)
		{
			x000 = zPtr[0].x - z2Ptr[0].x;
			y000 = zPtr[0].y - z2Ptr[0].y;
			zPtr[0].x += z2Ptr[0].x;
			zPtr[0].y += z2Ptr[0].y;
			z2Ptr[0].x = x000;
			z2Ptr[0].y = y000;
			x001 = x010 * (zPtr[s].x - z2Ptr[s].x);
			y001 = x010 * (zPtr[s].y - z2Ptr[s].y);
			zPtr[s].x += z2Ptr[s].x;
			zPtr[s].y += z2Ptr[s].y;
			z2Ptr[s].x = x001 + y001;
			z2Ptr[s].y = y001 - x001;
			x002 = z2Ptr[2*s].x - zPtr[2*s].x;
			y002 = zPtr[2*s].y  - z2Ptr[2*s].y;
			zPtr[2*s].x += z2Ptr[2*s].x;
			zPtr[2*s].y += z2Ptr[2*s].y;
			z2Ptr[2*s].x = y002;
			z2Ptr[2*s].y = x002;
			x003 = x010 * (z2Ptr[3*s].x - zPtr[3*s].x);
			y003 = x010 * (z2Ptr[3*s].y - zPtr[3*s].y);
			zPtr[3*s].x += z2Ptr[3*s].x;
			zPtr[3*s].y += z2Ptr[3*s].y;
			z2Ptr[3*s].x = x003 - y003;
			z2Ptr[3*s].y = y003 + x003;

			x100 = zPtr[1].x - z2Ptr[1].x;
			y100 = zPtr[1].y - z2Ptr[1].y;
			zPtr[1].x += z2Ptr[1].x;
			zPtr[1].y += z2Ptr[1].y;
			z2Ptr[1].x = x100;
			z2Ptr[1].y = y100;
			x101 = x010 * (zPtr[s + 1].x - z2Ptr[s + 1].x);
			y101 = x010 * (zPtr[s + 1].y - z2Ptr[s + 1].y);
			zPtr[s + 1].x += z2Ptr[s + 1].x;
			zPtr[s + 1].y += z2Ptr[s + 1].y;
			z2Ptr[s + 1].x = x101 + y101;
			z2Ptr[s + 1].y = y101 - x101;
			x102 = z2Ptr[2*s + 1].x - zPtr[2*s + 1].x;
			y102 = zPtr[2*s + 1].y  - z2Ptr[2*s + 1].y;
			zPtr[2*s + 1].x += z2Ptr[2*s + 1].x;
			zPtr[2*s + 1].y += z2Ptr[2*s + 1].y;
			z2Ptr[2*s + 1].x = y102;
			z2Ptr[2*s + 1].y = x102;
			x103 = x010 * (z2Ptr[3*s + 1].x - zPtr[3*s + 1].x);
			y103 = x010 * (z2Ptr[3*s + 1].y - zPtr[3*s + 1].y);
			zPtr[3*s + 1].x += z2Ptr[3*s + 1].x;
			zPtr[3*s + 1].y += z2Ptr[3*s + 1].y;
			z2Ptr[3*s + 1].x = x103 - y103;
			z2Ptr[3*s + 1].y = y103 + x103;

			x200 = zPtr[2].x - z2Ptr[2].x;
			y200 = zPtr[2].y - z2Ptr[2].y;
			zPtr[2].x += z2Ptr[2].x;
			zPtr[2].y += z2Ptr[2].y;
			z2Ptr[2].x = x200;
			z2Ptr[2].y = y200;
			x201 = x010 * (zPtr[s + 2].x - z2Ptr[s + 2].x);
			y201 = x010 * (zPtr[s + 2].y - z2Ptr[s + 2].y);
			zPtr[s + 2].x += z2Ptr[s + 2].x;
			zPtr[s + 2].y += z2Ptr[s + 2].y;
			z2Ptr[s + 2].x = x201 + y201;
			z2Ptr[s + 2].y = y201 - x201;
			x202 = z2Ptr[2*s + 2].x - zPtr[2*s + 2].x;
			y202 = zPtr[2*s + 2].y  - z2Ptr[2*s + 2].y;
			zPtr[2*s + 2].x += z2Ptr[2*s + 2].x;
			zPtr[2*s + 2].y += z2Ptr[2*s + 2].y;
			z2Ptr[2*s + 2].x = y202;
			z2Ptr[2*s + 2].y = x202;
			x203 = x010 * (z2Ptr[3*s + 2].x - zPtr[3*s + 2].x);
			y203 = x010 * (z2Ptr[3*s + 2].y - zPtr[3*s + 2].y);
			zPtr[3*s + 2].x += z2Ptr[3*s + 2].x;
			zPtr[3*s + 2].y += z2Ptr[3*s + 2].y;
			z2Ptr[3*s + 2].x = x203 - y203;
			z2Ptr[3*s + 2].y = y203 + x203;

			x300 = zPtr[3].x - z2Ptr[3].x;
			y300 = zPtr[3].y - z2Ptr[3].y;
			zPtr[3].x += z2Ptr[3].x;
			zPtr[3].y += z2Ptr[3].y;
			z2Ptr[3].x = x300;
			z2Ptr[3].y = y300;
			x301 = x010 * (zPtr[s + 3].x - z2Ptr[s + 3].x);
			y301 = x010 * (zPtr[s + 3].y - z2Ptr[s + 3].y);
			zPtr[s + 3].x += z2Ptr[s + 3].x;
			zPtr[s + 3].y += z2Ptr[s + 3].y;
			z2Ptr[s + 3].x = x301 + y301;
			z2Ptr[s + 3].y = y301 - x301;
			x302 = z2Ptr[2*s + 3].x - zPtr[2*s + 3].x;
			y302 = zPtr[2*s + 3].y  - z2Ptr[2*s + 3].y;
			zPtr[2*s + 3].x += z2Ptr[2*s + 3].x;
			zPtr[2*s + 3].y += z2Ptr[2*s + 3].y;
			z2Ptr[2*s + 3].x = y302;
			z2Ptr[2*s + 3].y = x302;
			x303 = x010 * (z2Ptr[3*s + 3].x - zPtr[3*s + 3].x);
			y303 = x010 * (z2Ptr[3*s + 3].y - zPtr[3*s + 3].y);
			zPtr[3*s + 3].x += z2Ptr[3*s + 3].x;
			zPtr[3*s + 3].y += z2Ptr[3*s + 3].y;
			z2Ptr[3*s + 3].x = x303 - y303;
			z2Ptr[3*s + 3].y = y303 + x303;

			zPtr += 8*s; z2Ptr += 8*s;
		}
	}

	zPtr  = &z[0];
	z2Ptr = &z[2*s];
	for (i = n; i != 0; i -= 4)
	{
		x000 = zPtr[0].x + z2Ptr[0].x;
		x002 = zPtr[s].x + z2Ptr[s].x;
		y000 = zPtr[0].y + z2Ptr[0].y;
		y002 = zPtr[s].y + z2Ptr[s].y;
		x001 = zPtr[0].x - z2Ptr[0].x;
		y003 = zPtr[s].y - z2Ptr[s].y;
		y001 = zPtr[0].y - z2Ptr[0].y;
		x003 = zPtr[s].x - z2Ptr[s].x;
		zPtr[0].x  = x000 + x002;
		zPtr[0].y  = y000 + y002;
		zPtr[s].x  = x000 - x002;
		zPtr[s].y  = y000 - y002;
		z2Ptr[0].x = x001 + y003;
		z2Ptr[0].y = y001 - x003;
		z2Ptr[s].x = x001 - y003;
		z2Ptr[s].y = y001 + x003;

		x100 = zPtr[1].x + z2Ptr[1].x;
		x102 = zPtr[s + 1].x + z2Ptr[s + 1].x;
		y100 = zPtr[1].y + z2Ptr[1].y;
		y102 = zPtr[s + 1].y + z2Ptr[s + 1].y;
		x101 = zPtr[1].x - z2Ptr[1].x;
		y103 = zPtr[s + 1].y - z2Ptr[s + 1].y;
		y101 = zPtr[1].y - z2Ptr[1].y;
		x103 = zPtr[s + 1].x - z2Ptr[s + 1].x;
		zPtr[1].x      = x100 + x102;
		zPtr[1].y      = y100 + y102;
		zPtr[s + 1].x  = x100 - x102;
		zPtr[s + 1].y  = y100 - y102;
		z2Ptr[1].x     = x101 + y103;
		z2Ptr[1].y     = y101 - x103;
		z2Ptr[s + 1].x = x101 - y103;
		z2Ptr[s + 1].y = y101 + x103;

		x200 = zPtr[2].x + z2Ptr[2].x;
		x202 = zPtr[s + 2].x + z2Ptr[s + 2].x;
		y200 = zPtr[2].y + z2Ptr[2].y;
		y202 = zPtr[s + 2].y + z2Ptr[s + 2].y;
		x201 = zPtr[2].x - z2Ptr[2].x;
		y203 = zPtr[s + 2].y - z2Ptr[s + 2].y;
		y201 = zPtr[2].y - z2Ptr[2].y;
		x203 = zPtr[s + 2].x - z2Ptr[s + 2].x;
		zPtr[2].x      = x200 + x202;
		zPtr[2].y      = y200 + y202;
		zPtr[s + 2].x  = x200 - x202;
		zPtr[s + 2].y  = y200 - y202;
		z2Ptr[2].x     = x201 + y203;
		z2Ptr[2].y     = y201 - x203;
		z2Ptr[s + 2].x = x201 - y203;
		z2Ptr[s + 2].y = y201 + x203;

		x300 = zPtr[3].x + z2Ptr[3].x;
		x302 = zPtr[s + 3].x + z2Ptr[s + 3].x;
		y300 = zPtr[3].y + z2Ptr[3].y;
		y302 = zPtr[s + 3].y + z2Ptr[s + 3].y;
		x301 = zPtr[3].x - z2Ptr[3].x;
		y303 = zPtr[s + 3].y - z2Ptr[s + 3].y;
		y301 = zPtr[3].y - z2Ptr[3].y;
		x303 = zPtr[s + 3].x - z2Ptr[s + 3].x;
		zPtr[3].x      = x300 + x302;
		zPtr[3].y      = y300 + y302;
		zPtr[s + 3].x  = x300 - x302;
		zPtr[s + 3].y  = y300 - y302;
		z2Ptr[3].x     = x301 + y303;
		z2Ptr[3].y     = y301 - x303;
		z2Ptr[s + 3].x = x301 - y303;
		z2Ptr[s + 3].y = y301 + x303;

		zPtr += 4*s; z2Ptr += 4*s;
	}
}

static void fourStepSquare(Complex * CONST z, CONST unsigned int n1, CONST unsigned int n2)
{
	unsigned int i, j;
	Complex * zPtr;

	zPtr = z;
	for (i = n1 / 4; i != 0; --i)
	{
		FFTSfreq(zPtr, n2, n1 + GAP);
		zPtr += 4;
	}

	zPtr = z;
	for (i = n2, j = 0; i != 0; --i, j += BLK_SIZE + n1 / BLK_SIZE)
	{
		FFTsquareFFT(zPtr, n1, j);
		zPtr += n1 + GAP;
	}

	zPtr = z;
	for (i = n1 / 4; i != 0; --i)
	{
		FFTStime(zPtr, n2, n1 + GAP);
		zPtr += 4;
	}
}

static void convertInitGFN(CONST unsigned int n1, CONST unsigned int n2)
{
	unsigned int i;
	CONST double Pi_Twon  = -M_PI / (2 * n1 * n2);
	CONST double Pi_Twon2 = -M_PI / (2 * n2);

	for (i = 0; i != n1; ++i)
	{
		e1[i].x = cos(Pi_Twon * i);
		e1[i].y = sin(Pi_Twon * i);
	}

	for (i = 0; i != n2; ++i)
	{
		e1[n1 + i].x = cos(Pi_Twon2 * i);
		e1[n1 + i].y = sin(Pi_Twon2 * i);
	}
}

static Complex * FFTinitGFN(CONST UInt32 m, CONST double x, unsigned int * CONST rn1, unsigned int * CONST rn2)
{
	unsigned int n1, n2, l;
	Complex * z;

	if (m / 2 <= FOUR_STEP_LIMIT)
	{
		n1 = m / 2;
		n2 = 1;
		FFTinit(n1, 0);
	}
	else
	{
		l = ilog(m / 2);
		n1 = 1 << (l - (l - 1) / 2);
		n2 = 1 << ((l - 1) / 2);
		FFTinit(n1, n2);
		FFTSinit(n2);
	}

	convertInitGFN(n1, n2);

	z = (Complex *)myAlloc((n1 + GAP) * n2 * sizeof(Complex));
	z[0].x = x; z[0].y = 0;
	for (l = 1; l < (n1 + GAP) * n2; ++l)
		z[l].x = z[l].y = 0;

	maxErr = 0;

	*rn1 = n1; *rn2 = n2;
	return z;
}

static void FFTfree(CONST unsigned int n2)
{
	if (n2 > 1)
	{
		myFree(e3);
	}
}

static void FFTsquareGFN(Complex * CONST z, CONST unsigned int n1, CONST unsigned int n2)
{
	if (n2 == 1)
		FFTsquareFFT(z, n1, 0);
	else
		fourStepSquare(z, n1, n2);
}

static void FFTnextStepGFN(Complex * CONST z, CONST Int32 base, CONST double t, CONST unsigned int n1, CONST unsigned int n2)
{
	unsigned int i, j;
	Complex * zPtr;
	CONST Complex * eiPtr, * ejPtr;
	double c, s, x, y, xi, yi, f1, f2;
	CONST double invBase = 1.0 / base;
	float errx, erry;

	f1 = f2 = 0;
	zPtr = z;

	eiPtr = e1;
	for (i = n1; i != 0; --i)
	{
		x = f1 + t * (eiPtr->x * zPtr->x + eiPtr->y * zPtr->y);
		y = f2 + t * (eiPtr->y * zPtr->x - eiPtr->x * zPtr->y);
		xi = round(x);
		yi = round(y);

		errx = (float)fabs(x - xi);
		if (errx > maxErr) maxErr = errx;
		erry = (float)fabs(y - yi);
		if (erry > maxErr) maxErr = erry;

		x = xi * invBase;
		y = yi * invBase;
		f1 = round(x);
		f2 = round(y);
		xi -= f1 * base;
		yi -= f2 * base;

		zPtr->x = eiPtr->x * xi + eiPtr->y * yi;
		zPtr->y = eiPtr->x * yi - eiPtr->y * xi;
		++eiPtr;
		++zPtr;
	}
	zPtr += GAP;

	ejPtr = &e1[n1 + 1];
	for (j = n2 - 1; j != 0; --j)
	{
		c = ejPtr->x; s = ejPtr->y;
		eiPtr = &e1[1];
		for (i = n1; i != 0; --i)
		{
			x = f1 + t * (c * zPtr->x + s * zPtr->y);
			y = f2 + t * (s * zPtr->x - c * zPtr->y);
			xi = round(x);
			yi = round(y);

			errx = (float)fabs(x - xi);
			if (errx > maxErr) maxErr = errx;
			erry = (float)fabs(y - yi);
			if (erry > maxErr) maxErr = erry;

			x = xi * invBase;
			y = yi * invBase;
			f1 = round(x);
			f2 = round(y);
			xi -= f1 * base;
			yi -= f2 * base;
			zPtr->x = c * xi + s * yi;
			zPtr->y = c * yi - s * xi;
			++zPtr;

			c = ejPtr->x * eiPtr->x - ejPtr->y * eiPtr->y;
			s = ejPtr->x * eiPtr->y + ejPtr->y * eiPtr->x;
			++eiPtr;
		}
		zPtr += GAP;
		++ejPtr;
	}

	f2 = -f2;	/* a[n] = -a[0] */

	zPtr = z;
	eiPtr = e1;
	for (i = 2; i != 0; --i)
	{
		x = f2 + eiPtr->x * zPtr->x - eiPtr->y * zPtr->y;
		y = f1 + eiPtr->x * zPtr->y + eiPtr->y * zPtr->x;
		xi = round(x);
		yi = round(y);

		errx = (float)fabs(x - xi);
		if (errx > maxErr) maxErr = errx;
		erry = (float)fabs(y - yi);
		if (erry > maxErr) maxErr = erry;

		x = xi * invBase;
		y = yi * invBase;
		f2 = round(x);
		f1 = round(y);
		xi -= f2 * base;
		yi -= f1 * base;

		zPtr->x = eiPtr->x * xi + eiPtr->y * yi;
		zPtr->y = eiPtr->x * yi - eiPtr->y * xi;
		++eiPtr;
		++zPtr;
	}

	x = eiPtr->x * zPtr->x - eiPtr->y * zPtr->y;
	y = eiPtr->x * zPtr->y + eiPtr->y * zPtr->x;
	xi = round(x);
	yi = round(y);
	errx = (float)fabs(x - xi);
	if (errx > maxErr) maxErr = errx;
	erry = (float)fabs(y - yi);
	if (erry > maxErr) maxErr = erry;
	f2 += xi;
	f1 += yi;
	zPtr->x = eiPtr->x * f2 + eiPtr->y * f1;
	zPtr->y = eiPtr->x * f1 - eiPtr->y * f2;
}

static void FFTgetResult(CONST Complex * CONST z, CONST Int32 base, Int32 * CONST a, CONST unsigned int n1, CONST unsigned int n2, float * CONST maxFFTerror)
{
	unsigned int i, j, k;
	CONST Complex * zPtr, * eiPtr, * ejPtr;
	CONST unsigned int n = n1 * n2;
	double c, s, x, y, xi, yi, f1, f2;
	CONST double t = 1.0 / n, invBase = 1.0 / base;
	float errx, erry;

	k = 0;
	f1 = f2 = 0;
	zPtr = z;
	ejPtr = &e1[n1];
	for (j = n2; j != 0; --j)
	{
		eiPtr = e1;
		for (i = n1; i != 0; --i)
		{
			c = ejPtr->x * eiPtr->x - ejPtr->y * eiPtr->y;
			s = ejPtr->x * eiPtr->y + ejPtr->y * eiPtr->x;
			++eiPtr;

			x = f1 + t * (c * zPtr->x + s * zPtr->y);
			y = f2 + t * (s * zPtr->x - c * zPtr->y);
			++zPtr;
			xi = round(x);
			yi = round(y);

			errx = (float)fabs(x - xi);
			if (errx > maxErr) maxErr = errx;
			erry = (float)fabs(y - yi);
			if (erry > maxErr) maxErr = erry;

			x = xi * invBase;
			y = yi * invBase;
			f1 = round(x);
			f2 = round(y);
			x = xi - f1 * base;
			y = yi - f2 * base;
			xi = round(x);
			yi = round(y);
			a[k]     = (Int32)xi;
			a[k + n] = (Int32)yi;
			++k;
		}
		zPtr += GAP;
		++ejPtr;
	}

	f2 = -f2;	/* a[n] = -a[0] */

	for (k = 0; k != 9; ++k)
	{
		yi = f2 + a[k];
		xi = f1 + a[k + n];
		y = yi * invBase;
		x = xi * invBase;
		f2 = round(y);
		f1 = round(x);
		a[k]     = (Int32)(yi - f2 * base);
		a[k + n] = (Int32)(xi - f1 * base);
	}
	a[9]     += (Int32)f2;
	a[9 + n] += (Int32)f1;

	*maxFFTerror = maxErr;
}

static int isOne(Int32 * CONST a, CONST unsigned int len, CONST Int32 base)
{
	unsigned int i;
	Int32 f, r;
	int rt;

	do
	{
		f = 0;
		for (i = 0; i != len; ++i)
		{
			f += a[i];
			r = f % base;
			if (r < 0) r += base;
			a[i] = r;
			f -= r;
			f /= base;
		}
		a[0] -= f; 	/* a[n] = -a[0] */
	} while (f != 0);

	rt = 1;
	for (i = 1; i != len; ++i)
	{
		if (a[i] != 0)
		{
			rt = 0;
			break;
		}
	}

	if (a[0] != 1)
		rt = 0;

	return rt;
}

static int check(CONST Int32 b, CONST UInt32 m)
{
	UInt32 j;
	unsigned int i, Nlen, bt, n1, n2;
	UInt32 * Na, * a;
	Int32 * Ra;
	Complex * z;
	float maxError;
	int r;
	CONST double t1 = 2.0 / m, t2 = 2 * t1;
	FILE * fp;
	char str[132];

	printf("Testing %d^%u+1...\r", b, m);

	Nlen = 1;
	Na = (UInt32 *)myAlloc(1 * sizeof(UInt32));
	Na[0] = (UInt32)b;
	for (j = m; j != 1; j /= 2)
	{
		a = (UInt32 *)myAlloc(2 * Nlen * sizeof(UInt32));
		for (i = 0; i != Nlen; ++i)
			a[i] = 0;

		for (i = 0; i != Nlen; ++i)
			a[i + Nlen] = mul_1_add_n(&a[i], Na[i], Na, Nlen);

		myFree(Na);
		Na = a;
		Nlen *= 2;
		if (Na[Nlen - 1] == 0) --Nlen;
	}

	z = FFTinitGFN(m, 2.0, &n1, &n2);
	for (i = lg(Na, Nlen) - 1; i != 0; --i)
	{
		FFTsquareGFN(z, n1, n2);
		bt = Na[i / (8 * sizeof(UInt32))] >> (i % (8 * sizeof(UInt32))) & 1;
		FFTnextStepGFN(z, b, (bt == 0) ? t1 : t2, n1, n2);
	}
	FFTsquareGFN(z, n1, n2);

	myFree(Na);

	Ra = (Int32 *)myAlloc(m * sizeof(UInt32));
	FFTgetResult(z, b, Ra, n1, n2, &maxError);
	myFree(z);
	FFTfree(n2);

	r = isOne(Ra, m, b);
	myFree(Ra);

	if (r == 1)
	{
		sprintf(str, "%d^%u+1 is a probable prime. (err = %.2e)\n", b, m, maxError);
	}
	else
	{
		if (2*log((double)b)/log(2.0) + ilog(m) + log((double)ilog(m))/log(2.0) < 53)
			sprintf(str, "%d^%u+1 is composite (err = %.2e).\n", b, m, maxError);
		else
			sprintf(str, "%d^%u+1 is a probable composite (err = %.2e).\n", b, m, maxError);
	}
	printf(str);
	fp = fopen("genefer.log", "a");
	if (fp == NULL)
	{
		fprintf(stderr, "Cannot write results to genefer.log\n");
		exit(1);
	}
	fprintf(fp, str);
	fclose(fp);

	return r;
}

static void test()
{
	check(100050, 16);
	check(100014, 32);
	check(100234, 64);
	check(100032, 128);
#if		(B_MAX == 35000000)
	check(6631340, 256);
	check(5386302, 512);
	check(4376378, 1024);
	check(3563456, 2048);
	check(2891508, 4096);
	check(2409664, 8192);
	check(1915122, 16384);
	check(1519380, 32768);
	check(1266062, 65536);
#else
	check(5684328, 256);
	check(4619000, 512);
	check(3752220, 1024);
	check(3066672, 2048);
	check(2485064, 4096);
	check(2030234, 8192);
	check(1651902, 16384);
	check(1277444, 32768);
	check(857678, 65536);
#endif
}

static double benchGFN(CONST Int32 b, CONST UInt32 n, CONST unsigned int m, float * CONST maxFFTerror)
{
	unsigned int i, n1, n2;
	clock_t clock0;
	double et;
	Complex * z;
	Int32 * Ra;
	CONST double t1 = 2.0 / n, t2 = 2 * t1;

	z = FFTinitGFN(n, (double)(b - 2), &n1, &n2);

	for (i = 0; i <= ilog(n); ++i)
	{
		FFTsquareGFN(z, n1, n2);
		FFTnextStepGFN(z, b, t1, n1, n2);
	}

	clock0 = clock();
	for (i = 0; i < m; ++i)
	{
		FFTsquareGFN(z, n1, n2);
		FFTnextStepGFN(z, b, (i % 2 == 0) ? t1 : t2, n1, n2);
	}
	et = (clock() - clock0) / (double)CLOCKS_PER_SEC;

	FFTsquareGFN(z, n1, n2);
	Ra = (Int32 *)myAlloc(n * sizeof(UInt32));
	FFTgetResult(z, b, Ra, n1, n2, maxFFTerror);

	myFree(z);
	FFTfree(n2);
	myFree(Ra);

	return et;
}

static void runBench()
{
	unsigned int i, m, dgts;
	Int32 b;
	UInt32 n;
	double et, etn;
	float maxError;
	FILE * fp;

	fp = fopen("genefer.bench", "w");
	printf("Generalized Fermat Number Bench\n");
	if (fp != NULL)
		fprintf(fp, "Generalized Fermat Number Bench\n");

	for (i = 0; i < 14; ++i)
	{
		n = 1 << (i + 8);
		m = 1 << (16 - i);
		b = (Int32)(B_MAX / pow(n, 0.3)) & 0xfffffffe;
		et = benchGFN(b, n, m, &maxError);
		etn = et / m;
		dgts = (unsigned int)(n * log((double)b) / log(10.0)) + 1;
		if (etn >= 1)
		{
			printf("%d^%u+1\tTime: %.03g s/mul.\tErr: %.2e\t%u digits\n", b, n, etn, maxError, dgts);
			if (fp != NULL)
				fprintf(fp, "%d^%u+1\tTime: %.03g s/mul.\tErr: %.2e\t%u digits\n", b, n, etn, maxError, dgts);
		}
		else if (etn >= 1e-3)
		{
			printf("%d^%u+1\tTime: %.03g ms/mul.\tErr: %.2e\t%u digits\n", b, n, etn * 1e3, maxError, dgts);
			if (fp != NULL)
				fprintf(fp, "%d^%u+1\tTime: %.03g ms/mul.\tErr: %.2e\t%u digits\n", b, n, etn * 1e3, maxError, dgts);
		}
		else
		{
			printf("%d^%u+1\tTime: %.03g us/mul.\tErr: %.2e\t%u digits\n", b, n, etn * 1e6, maxError, dgts);
			if (fp != NULL)
				fprintf(fp, "%d^%u+1\tTime: %.03g us/mul.\tErr: %.2e\t%u digits\n", b, n, etn * 1e6, maxError, dgts);
		}
	}

	if (fp != NULL)
		fclose(fp);
}

int main(int argc, char * argv[])
{
	int r, i, currentLine;
	UInt32 N;
	Int32 b;
	FILE * fp, * fpi;
	char str[132];

#if	(defined(_MSC_VER) && defined(_M_IX86))
	_control87(_PC_64, _MCW_PC);
	_control87(_RC_NEAR, _MCW_RC);
#endif

	printf("GeneFer 1.3 (%s)  Copyright (C) 2001-2002, Yves GALLOT\n", VERSION);
	printf("A program for finding large probable generalized Fermat primes.\n\n");
	printf("Usage: GeneFer -b            run bench\n");
	printf("       GeneFer -t            run test\n");
	printf("       GeneFer <filename>    test <filename>\n");
	printf("       GeneFer               use interactive mode\n\n");

	if (argc == 1)
	{
		printf("  1. bench\n  2. test\n  3. normal\n");

		fgets(str, 80, stdin);
		r = atoi(str);
		if (r == 1)
		{
			runBench();
			return 0;
		}
		if (r == 2)
		{
			test();
			return 0;
		}

		printf("N: ");
		fgets(str, 80, stdin);
		N = lpt(atoi(str));
		if (N < 16) N = 16;

		printf("b: ");
		fgets(str, 80, stdin);
		b = atoi(str) & 0xfffffffe;

		check(b, N);
		return 0;
	}

	if (strcmp(argv[1], "-b") == 0)
	{
		runBench();
		return 0;
	}
	if (strcmp(argv[1], "-t") == 0)
	{
		test();
		return 0;
	}

	fp = fopen(argv[1], "r");
	if (fp == NULL)
	{
		fprintf(stderr, "Cannot open '%s'\n", argv[1]);
		return 1;
	}

	fpi = fopen("genefer.ini", "r");
	if (fpi != NULL)
	{
		fgets(str, 132, fpi);
		currentLine = atoi(str);
		fclose(fpi);
		printf("Continue test of file '%s' at line %d.\n", argv[1], currentLine);
	}
	else
	{
		currentLine = 0;
		printf("Start test of file '%s'.\n", argv[1]);
	}

	for (i = 0; i < currentLine; ++i)
		fgets(str, 132, fp);

	while (fgets(str, 132, fp) != NULL)
	{
		if (sscanf(str, "%u%u", &b, &N) == 2)
		{
			if ((b != 0) && (b <= 0x7fffffff) && (N >= 16))
			{
				check(b, N);
			}
		}
		++currentLine;
		fpi = fopen("genefer.ini", "w");
		if (fpi != NULL)
		{
			fprintf(fpi, "%d\n", currentLine);
			fclose(fpi);
		}
	}

	fclose(fp);
	return 0;
}
