/*
This file is part of mfaktc.
Copyright (C) 2009, 2010, 2011, 2012  Oliver Weihe (o.weihe@t-online.de)

mfaktc is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

mfaktc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with mfaktc.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>  

#include "params.h"
#include "my_types.h"
#include "compatibility.h"
#include "my_intrinsics.h"

#define NVCC_EXTERN
#include "timer.h"
#include "output.h"
#undef NVCC_EXTERN


// Inline to count the leading zero bits of a 32-bit word
// (for a non-zero word the highest set bit is therefore bit 31 - ___clz(a)).
// If no bit is set, 32 is returned on every compute capability: the software
// fallback used for CC 1.x yields the same result as the PTX clz.b32 instruction.

__device__ static unsigned int ___clz (unsigned int a)
{
#if (__CUDA_ARCH__ >= FERMI) /* clz (count leading zeroes) is not available on CC 1.x devices */
	unsigned int r;
	asm("clz.b32 %0, %1;" : "=r" (r) : "r" (a));
	return r;
#else
	/* Binary search: whenever the upper part of the remaining window is empty, */
	/* add its width to the result and shift the lower part up into its place. */
	unsigned int r = 0;
	if ((a & 0xFFFF0000) == 0) r = 16, a <<= 16;
	if ((a & 0xFF000000) == 0) r += 8, a <<= 8;
	if ((a & 0xF0000000) == 0) r += 4, a <<= 4;
	if ((a & 0xC0000000) == 0) r += 2, a <<= 2;
	/* The shifts above only discard zero bits, so a is zero here exactly when the */
	/* input was zero. In that case both remaining bits are clear: count 2, not 1, */
	/* so that the total becomes 32 instead of 31. */
	if ((a & 0x80000000) == 0) r += ((a == 0) ? 2 : 1);
	return r;
#endif
}

// Population count: returns how many bits of a 32-bit word are set (0..32)

__device__ static unsigned int ___popcnt (unsigned int a)
{
#if (__CUDA_ARCH__ >= FERMI) /* popc (population count) is not available on CC 1.x devices */
	unsigned int bits;
	asm("popc.b32 %0, %1;" : "=r" (bits) : "r" (a));
	return bits;
#else
	/* Software fallback: add neighbouring bit fields pairwise, doubling the */
	/* field width in every step until a single sum is left. */
	unsigned int lo, hi;

	lo = a & 0x55555555;
	hi = (a >> 1) & 0x55555555;
	a = lo + hi;                    // sixteen fields, each holding 0..2

	lo = a & 0x33333333;
	hi = (a >> 2) & 0x33333333;
	a = lo + hi;                    // eight fields, each holding 0..4

	lo = a & 0x07070707;
	hi = (a >> 4) & 0x07070707;
	a = lo + hi;                    // four fields, each holding 0..8

	lo = a & 0x000F000F;
	hi = (a >> 8) & 0x000F000F;
	a = lo + hi;                    // two fields, each holding 0..16

	lo = a & 0x0000001F;
	hi = (a >> 16) & 0x0000001F;
	return lo + hi;                 // final sum, 0..32
#endif
}

/* For some reason the NVIDIA compiler (at least version 4) generates better code */
/* for CC 2.1 devices when some kernels are built for 3 blocks and others for 4 blocks. */
/* Both values are defined here, ahead of the kernel headers included below. */
/* NOTE(review): presumably the tf_*.h kernels pick one of the two as the min-blocks */
/* argument of __launch_bounds__ - confirm in those headers. */

/* Discard any earlier definition of KERNEL_MIN_BLOCKS before redefining it */
/* (NOTE(review): presumably it comes from params.h - confirm). */
#undef KERNEL_MIN_BLOCKS
#define KERNEL_MIN_BLOCKS 3
#define KERNEL_MAX_BLOCKS 4

#include "tf_192.h"
#include "tf_224.h"
#include "tf_256.h"
#include "tf_160.h"
#include "tf_128.h"
#include "tf_96.h"
#include "tf_m127.h"
#include "tf_m107.h"
#include "tf_m89.h"
#include "tf_m61.h"
#include "tf_m31.h"
#include "tf_f0_31.h"
#include "tf_f32_63.h"
#include "tf_f64_95.h"
#include "tf_f96_127.h"
#include "tf_f128_159.h"
#include "tf_f160_191.h"
#include "tf_f192_223.h"

#include "tf_common_gs.cu"
