/*
This file is part of mfaktc.
Copyright (C) 2009, 2010, 2011, 2012  Oliver Weihe (o.weihe@t-online.de)

mfaktc is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

mfaktc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
                                
You should have received a copy of the GNU General Public License
along with mfaktc.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <stdio.h>
#include <stdlib.h>
#ifndef _MSC_VER
#include <unistd.h>
#endif
#include <string.h>
#include <errno.h> 

#include <cuda.h>
#include <cuda_runtime.h>  

#include "params.h"
#include "my_types.h"
#include "compatibility.h"

#include "read_config.h"
#include "parse.h"
#include "timer.h"
#include "checkpoint.h"
#include "signal_handler.h"
#include "output.h"

/*
Per-class trial factoring routine; it is only declared here, its definition is
not part of this file. tf() calls it once for every class that has to be
processed, passing the first k of that class (k_min + class number), the upper
k bound and the shared program state, and adds the return value to its count
of factors found.
When built with MSVC the declaration requests C linkage.
*/
#ifdef _MSC_VER
extern "C" int tf_class_barrett92_gs(unsigned long long int k_min, unsigned long long int k_max, mystuff_t *mystuff);
#else
extern int tf_class_barrett92_gs(unsigned long long int k_min, unsigned long long int k_max, mystuff_t *mystuff);
#endif


static unsigned long long int k_min_mersenne_series(unsigned int p, int bits)
/*
evaluates 2^(bits-(p+1)) + 2^(bits-(2p+1)) + 2^(bits-(3p+1)) + ...
which is the expansion of 2^bits / (2 * (2^p - 1)) into powers of two.
Terms with a negative exponent are smaller than 1 and are dropped.

return value:
0       even the leading term is smaller than 1
2^64-1  the leading term does not fit into 64 bits (shifting a 64 bit value
        by 64 or more bits is undefined in C, so that shift is never executed)
*/
{
  unsigned long long int k = 0;
  int e;

  if (bits < 0) return 0ULL;
  e = bits - (int)(p + 1);
  if (e >= 64) return (0ULL - 1ULL);
  /* the exponents are distinct and all below 64, so the sum cannot overflow */
  for (; e >= 0; e -= (int)p) k += 1ULL << e;
  return k;
}

unsigned long long int calculate_k_min(int fermat, unsigned int exp, int bits)
/*
calculates the smallest k of a bit level, that is the k which solves
"2 * k * M<exp> + 1 = 2^bits" (double Mersenne, M<exp> = 2^exp - 1) or
"k * 2^exp + 1 = 2^bits" (Fermat).

fermat: non-zero for Fermat factoring, 0 for double Mersenne factoring
exp:    exponent; for double Mersenne only 31, 61, 89, 107 and 127 are supported
bits:   bit level the k range starts at

return value:
the k described above, clamped to the range 1 ... 2^64-1:
  1       when the bit level is so small that every k >= 1 is already above it
  2^64-1  when the exact value needs more than 64 bits
The program is terminated with exit(1) for an unsupported double Mersenne
exponent.
*/
{
  unsigned long long int k;

  if (fermat) {
    // solve for k in: 2^bits = k*2^exp+1
    // k = 2^(bits-exp)
    // The difference is checked before it is formed: "bits - exp" would be
    // evaluated as unsigned and wrap around for bits < exp.
    if (bits < 0 || exp > (unsigned int)bits) return 1ULL;
    if ((unsigned int)bits - exp >= 64) return (0ULL - 1ULL);
    return (1ULL << ((unsigned int)bits - exp));
  }
  if (exp == 31 || exp == 61 || exp == 89 || exp == 107 || exp == 127) {
    // solve for k in: 2^bits = 2*k*M<exp>+1
    // M31:  k = 2^(bits-32) + 2^(bits-63) + 2^(bits-94)
    // M61:  k = 2^(bits-62) + 2^(bits-123)
    // M89:  k = 2^(bits-90)
    // M107: k = 2^(bits-108)
    // M127: k = 2^(bits-128)
    k = k_min_mersenne_series(exp, bits);
    return (k != 0 ? k : 1ULL);
  }
  // Can't happen:
  printf("Bad calculate_k_min call\n");
  exit(1);
}

static unsigned long long int k_max_mersenne_series(unsigned int p, int bits)
/*
evaluates 2^(bits-(p+1)) + 2^(bits-(2p+1)) + 2^(bits-(3p+1)) + ...
which is the expansion of 2^bits / (2 * (2^p - 1)) into powers of two.
Terms with a negative exponent are smaller than 1 and are dropped.

return value:
0       even the leading term is smaller than 1
2^64-1  the leading term does not fit into 64 bits (shifting a 64 bit value
        by 64 or more bits is undefined in C, so that shift is never executed)
*/
{
  unsigned long long int k = 0;
  int e;

  if (bits < 0) return 0ULL;
  e = bits - (int)(p + 1);
  if (e >= 64) return (0ULL - 1ULL);
  /* the exponents are distinct and all below 64, so the sum cannot overflow */
  for (; e >= 0; e -= (int)p) k += 1ULL << e;
  return k;
}

unsigned long long int calculate_k_max(int fermat, unsigned int exp, int bits)
/*
calculates the biggest possible k in "2 * k * M<exp> + 1 < 2^bits" (double
Mersenne, M<exp> = 2^exp - 1) or in "k * 2^exp + 1 < 2^bits" (Fermat).

fermat: non-zero for Fermat factoring, 0 for double Mersenne factoring
exp:    exponent; for double Mersenne only 31, 61, 89, 107 and 127 are supported
bits:   bit level the k range ends at

return value:
the k described above, clamped to the range 0 ... 2^64-1:
  0       when no k >= 1 stays below the bit level
  2^64-1  when the exact value needs more than 64 bits
The program is terminated with exit(1) for an unsupported double Mersenne
exponent.
*/
{
  unsigned long long int k;

  if (fermat) {
    // solve for k in: 2^bits = k*2^exp+1
    // k = 2^(bits-exp)-1
    // The difference is checked before it is formed: "bits - exp" would be
    // evaluated as unsigned and wrap around for bits < exp.
    if (bits < 0 || exp > (unsigned int)bits) return 0ULL;
    if ((unsigned int)bits - exp >= 64) return (0ULL - 1ULL);
    return ((1ULL << ((unsigned int)bits - exp)) - 1);
  }
  if (exp == 31 || exp == 61 || exp == 89 || exp == 107 || exp == 127) {
    // solve for k in: 2^bits = 2*k*M<exp>+1
    // M31:  k = 2^(bits-32) + 2^(bits-63) + 2^(bits-94) - 1
    // M61:  k = 2^(bits-62) + 2^(bits-123) - 1
    // M89:  k = 2^(bits-90) - 1
    // M107: k = 2^(bits-108) - 1
    // M127: k = 2^(bits-128) - 1
    k = k_max_mersenne_series(exp, bits);
    if (k == 0ULL) return 0ULL;               /* "0 - 1" would wrap to 2^64-1 */
    if (k == (0ULL - 1ULL)) return k;         /* already saturated */
    return (k-1);
  }
  // Can't happen:
  printf("Bad calculate_k_max call\n");
  exit(1);
}


int class_needed(int fermat, unsigned int exp, unsigned long long int k_min, int c)
{
/*
tells the caller whether class c (that is k = k_min + c) has to be processed.

return value:
1 the class must be processed
0 the class can be skipped because every factor candidate in it is a multiple
  of 3, 5, 7 or 11 (11 only if MORE_CLASSES is defined), or - non-Fermat
  only - is 3 or 5 mod 8, or - Fermat only - because k is even (even k are
  covered by a larger exponent)
*/
  const unsigned long long int k = k_min + c;
  unsigned int res8, res3, res5, res7, res11;
  unsigned long long int twice;

  if (fermat) {
    /* residues of 2^exp; the candidate is k * 2^exp + 1 */
    res3  = (1 << (exp % 2)) % 3;
    res5  = (1 << (exp % 4)) % 5;
    res7  = (1 << (exp % 6)) % 7;
    res11 = (1 << (exp % 10)) % 11;
    (void)res11;                        /* only read if MORE_CLASSES is defined */

    if (!(k & 1)) return 0;
    if ((res3 * (k %  3) + 1) %  3 == 0) return 0;
    if ((res5 * (k %  5) + 1) %  5 == 0) return 0;
    if ((res7 * (k %  7) + 1) %  7 == 0) return 0;
#ifdef MORE_CLASSES
    if ((res11 * (k % 11) + 1) % 11 == 0) return 0;
#endif
    return 1;
  }

  /* residues of the multiplier m in the candidate 2 * k * m + 1: for the five
     special exponents the tabulated values are those of 2^exp - 1, for any
     other exponent the exponent itself is used */
  switch (exp) {
    case 31:  res8 = 7; res3 = 1; res5 = 2; res7 = 1; res11 = 1; break;
    case 61:  res8 = 7; res3 = 1; res5 = 1; res7 = 1; res11 = 1; break;
    case 89:  res8 = 7; res3 = 1; res5 = 1; res7 = 3; res11 = 5; break;
    case 107: res8 = 7; res3 = 1; res5 = 2; res7 = 3; res11 = 6; break;
    case 127: res8 = 7; res3 = 1; res5 = 2; res7 = 1; res11 = 6; break;
    default:
      res8  = exp %  8;
      res3  = exp %  3;
      res5  = exp %  5;
      res7  = exp %  7;
      res11 = exp % 11;
      break;
  }
  (void)res11;                          /* only read if MORE_CLASSES is defined */

  /* 2*k*m == 2 or 4 mod 8  <=>  candidate == 3 or 5 mod 8 */
  twice = (2 * res8 * (k % 8)) % 8;
  if (twice == 2 || twice == 4) return 0;

  /* 2*k*m == n-1 mod n  <=>  candidate is a multiple of n */
  if ((2 * res3 * (k %  3)) %  3 ==  2) return 0;
  if ((2 * res5 * (k %  5)) %  5 ==  4) return 0;
  if ((2 * res7 * (k %  7)) %  7 ==  6) return 0;
#ifdef MORE_CLASSES
  if ((2 * res11 * (k % 11)) % 11 == 10) return 0;
#endif

  return 1;
}





int tf(mystuff_t *mystuff, int kernel)
/*
tf M<mystuff->exponent> from 2^<mystuff->bit_min> to 2^<mystuff->mystuff->bit_max_stage>

kernel: see my_types.h -> enum GPUKernels

return value (mystuff->mode = MODE_NORMAL):
number of factors found
RET_CUDA_ERROR cudaGetLastError() returned an error
RET_QUIT if early exit was requested by SIGINT

return value (mystuff->mode = MODE_SELFTEST_SHORT or MODE_SELFTEST_FULL):
0 for a successfull selftest (known factor was found)
1 no factor found
2 wrong factor returned
RET_CUDA_ERROR cudaGetLastError() returned an error

other return value 
-1 unknown mode
*/
{
  int cur_class, max_class = NUM_CLASSES-1, i;
  unsigned long long int k_min, k_max;
  struct timeval timer, timer_last_checkpoint;
  int factorsfound = 0, numfactors = 0, restart = 0;

  int retval = 0;
  
  cudaError_t cudaError;
  
  unsigned long long int time_run, time_est;
  
  mystuff->stats.output_counter = 0; /* reset output counter, needed for status headline */
//  mystuff->stats.ghzdays = primenet_ghzdays(mystuff->exponent, mystuff->bit_min, mystuff->bit_max_stage);

  if(mystuff->mode != MODE_SELFTEST_SHORT) {
    if (mystuff->fermat_factoring || mystuff->exponent <= 127) {
      unsigned long long k_min, k_max;
      char krange[100];
      k_min=calculate_k_min(mystuff->fermat_factoring, mystuff->exponent, mystuff->bit_min);
      k_max=calculate_k_max(mystuff->fermat_factoring, mystuff->exponent, mystuff->bit_max_stage);
      if (mystuff->k_lower_bound > k_min) k_min = mystuff->k_lower_bound;
      if (mystuff->k_upper_bound && mystuff->k_upper_bound < k_max) k_max = mystuff->k_upper_bound;
      if (k_min % 1000000000000000ULL == 0 && k_max % 1000000000000000ULL == 0)
        sprintf(krange, "k range: %" PRIu64 "P to %" PRIu64 "P", k_min / 1000000000000000ULL, k_max / 1000000000000000ULL);
      else if (k_min % 1000000000000ULL == 0 && k_max % 1000000000000ULL == 0)
        sprintf(krange, "k range: %" PRIu64 "T to %" PRIu64 "T", k_min / 1000000000000ULL, k_max / 1000000000000ULL);
      else if (k_min % 1000000000ULL == 0 && k_max % 1000000000ULL == 0)
        sprintf(krange, "k range: %" PRIu64 "G to %" PRIu64 "G", k_min / 1000000000ULL, k_max / 1000000000ULL);
      else if (k_min % 1000000ULL == 0 && k_max % 1000000ULL == 0)
        sprintf(krange, "k range: %" PRIu64 "M to %" PRIu64 "M", k_min / 1000000ULL, k_max / 1000000ULL);
      else
        sprintf(krange, "k range: %" PRIu64 " to %" PRIu64 "", k_min, k_max);
      sprintf(krange+strlen(krange), " (%d-bit factors)", mystuff->bit_max_stage);
      printf("Starting trial factoring of %s in %s\n", mystuff->exponent_string, krange);
    }
//    else
//      printf("Starting trial factoring %s from 2^%d to 2^%d (%.2f GHz-days)\n", mystuff->exponent_string, mystuff->bit_min, mystuff->bit_max_stage, mystuff->stats.ghzdays);
  }
  if((mystuff->mode != MODE_NORMAL) && (mystuff->mode != MODE_SELFTEST_SHORT) && (mystuff->mode != MODE_SELFTEST_FULL))
  {
    printf("ERROR, invalid mode for tf(): %d\n", mystuff->mode);
    return -1;
  }
  timer_init(&timer);
  timer_init(&timer_last_checkpoint);
  
  mystuff->stats.class_counter = 0;

  k_min=calculate_k_min(mystuff->fermat_factoring, mystuff->exponent, mystuff->bit_min);
  k_max=calculate_k_max(mystuff->fermat_factoring, mystuff->exponent, mystuff->bit_max_stage);

  if (mystuff->k_lower_bound && k_min < mystuff->k_lower_bound) k_min = mystuff->k_lower_bound;
  if (mystuff->k_upper_bound && k_max > mystuff->k_upper_bound) k_max = mystuff->k_upper_bound;

// Comment this out as testing k's that are too small can cause sporadic exponentiation failures in tf_validate.h.
// I think only self-testing required k_min to be 0 mod NUM_CLASSES */
//  k_min -= k_min % NUM_CLASSES;	/* k_min is now 0 mod NUM_CLASSES */

  if(mystuff->mode != MODE_SELFTEST_SHORT && mystuff->verbosity >= 1)
  {
    printf(" k_min = %" PRIu64 "\n",k_min);
    printf(" k_max = %" PRIu64 "\n",k_max);
  }

  if(kernel == AUTOSELECT_KERNEL)
  {
    kernel = BARRETT92_MUL32_GS;
  }

  if(mystuff->fermat_factoring) {
    char *kname;
    if (mystuff->exponent <= 31) {
	    if (mystuff->bit_max_stage <= 89) kname = "mfaktc_barrett89_F0_31gs";
	    else if (mystuff->bit_max_stage <= 96) kname = "mfaktc_barrett96_F0_31gs";
	    else printf ("No kernel for that bit level\n"), exit (1);
    }
    else if (mystuff->exponent <= 63) {
	    if (mystuff->bit_max_stage <= 89) kname = "mfaktc_barrett89_F32_63gs";
	    else if (mystuff->bit_max_stage <= 96) kname = "mfaktc_barrett96_F32_63gs";
	    else if (mystuff->bit_max_stage <= 108) kname = "mfaktc_barrett108_F32_63gs";
	    else if (mystuff->bit_max_stage <= 120) kname = "mfaktc_barrett120_F32_63gs";
	    else if (mystuff->bit_max_stage <= 128) kname = "mfaktc_barrett128_F32_63gs";
	    else printf ("No kernel for that bit level\n"), exit (1);
    }
    else if (mystuff->exponent <= 95) {
	    if (mystuff->bit_max_stage <= 108) kname = "mfaktc_barrett108_F64_95gs";
	    else if (mystuff->bit_max_stage <= 120) kname = "mfaktc_barrett120_F64_95gs";
	    else if (mystuff->bit_max_stage <= 128) kname = "mfaktc_barrett128_F64_95gs";
	    else if (mystuff->bit_max_stage <= 140) kname = "mfaktc_barrett140_F64_95gs";
	    else if (mystuff->bit_max_stage <= 152) kname = "mfaktc_barrett152_F64_95gs";
	    else if (mystuff->bit_max_stage <= 160) kname = "mfaktc_barrett160_F64_95gs";
	    else printf ("No kernel for that bit level\n"), exit (1);
    }
    else if (mystuff->exponent <= 127) {
	    if (mystuff->bit_max_stage <= 140) kname = "mfaktc_barrett140_F96_127gs";
	    else if (mystuff->bit_max_stage <= 152) kname = "mfaktc_barrett152_F96_127gs";
	    else if (mystuff->bit_max_stage <= 160) kname = "mfaktc_barrett160_F96_127gs";
	    else if (mystuff->bit_max_stage <= 172) kname = "mfaktc_barrett172_F96_127gs";
	    else if (mystuff->bit_max_stage <= 183) kname = "mfaktc_barrett183_F96_127gs";
	    else printf ("No kernel for that bit level\n"), exit (1);
    }
    else if (mystuff->exponent <= 159) {
	    if (mystuff->bit_max_stage <= 172) kname = "mfaktc_barrett172_F128_159gs";
	    else if (mystuff->bit_max_stage <= 183) kname = "mfaktc_barrett183_F128_159gs";
	    else if (mystuff->bit_max_stage <= 185) kname = "mfaktc_barrett185_F128_159gs";
	    else if (mystuff->bit_max_stage <= 188) kname = "mfaktc_barrett188_F128_159gs";
	    else if (mystuff->bit_max_stage <= 192) kname = "mfaktc_barrett192_F128_159gs";
	    else if (mystuff->bit_max_stage <= 204) kname = "mfaktc_barrett204_F128_159gs";
	    else if (mystuff->bit_max_stage <= 215) kname = "mfaktc_barrett215_F128_159gs";
	    else printf ("No kernel for that bit level\n"), exit (1);
    }
    else if (mystuff->exponent <= 191) {
	    if (mystuff->bit_max_stage <= 204) kname = "mfaktc_barrett204_F160_191gs";
	    else if (mystuff->bit_max_stage <= 215) kname = "mfaktc_barrett215_F160_191gs";
	    else if (mystuff->bit_max_stage <= 217) kname = "mfaktc_barrett217_F160_191gs";
	    else if (mystuff->bit_max_stage <= 220) kname = "mfaktc_barrett220_F160_191gs";
	    else if (mystuff->bit_max_stage <= 224) kname = "mfaktc_barrett224_F160_191gs";
	    else if (mystuff->bit_max_stage <= 236) kname = "mfaktc_barrett236_F160_191gs";
	    else if (mystuff->bit_max_stage <= 247) kname = "mfaktc_barrett247_F160_191gs";
	    else printf ("No kernel for that bit level\n"), exit (1);
    }
    else if (mystuff->exponent <= 223) {
	    if (mystuff->bit_max_stage <= 236) kname = "mfaktc_barrett236_F192_223gs";
	    else if (mystuff->bit_max_stage <= 247) kname = "mfaktc_barrett247_F192_223gs";
	    else if (mystuff->bit_max_stage <= 249) kname = "mfaktc_barrett249_F192_223gs";
	    else if (mystuff->bit_max_stage <= 252) kname = "mfaktc_barrett252_F192_223gs";
	    else printf ("No kernel for that bit level\n"), exit (1);
    }
    else
	    printf ("No kernel for that Fermat number\n"), exit (1);
    strcpy(mystuff->stats.kernelname, kname);
  }
  else if (mystuff->exponent == 31) {
    char *kname;
    if (mystuff->bit_max_stage <= 89) kname = "mfaktc_barrett89_M31gs";
    else if (mystuff->bit_max_stage <= 96) kname = "mfaktc_barrett96_M31gs";
    else printf ("No kernel for that bit level\n"), exit (1);
    strcpy(mystuff->stats.kernelname, kname);
  } else if (mystuff->exponent == 61) {
    char *kname;
    if (mystuff->bit_max_stage <= 108) kname = "mfaktc_barrett108_M61gs";
    else if (mystuff->bit_max_stage <= 120) kname = "mfaktc_barrett120_M61gs";
    else if (mystuff->bit_max_stage <= 125) kname = "mfaktc_barrett128_M61gs";
    else printf ("No kernel for that bit level\n"), exit (1);
    strcpy(mystuff->stats.kernelname, kname);
  } else if (mystuff->exponent == 89) {
    char *kname;
    if (mystuff->bit_max_stage <= 128) kname = "mfaktc_barrett128_M89gs";
    else if (mystuff->bit_max_stage <= 140) kname = "mfaktc_barrett140_M89gs";
    else if (mystuff->bit_max_stage <= 152) kname = "mfaktc_barrett152_M89gs";
    else if (mystuff->bit_max_stage <= 153) kname = "mfaktc_barrett160_M89gs";
    else printf ("No kernel for that bit level\n"), exit (1);
    strcpy(mystuff->stats.kernelname, kname);
  } else if (mystuff->exponent == 107) {
    char *kname;
    if (mystuff->bit_max_stage <= 152) kname = "mfaktc_barrett152_M107gs";
    else if (mystuff->bit_max_stage <= 160) kname = "mfaktc_barrett160_M107gs";
    else if (mystuff->bit_max_stage <= 172) kname = "mfaktc_barrett172_M107gs";
    else printf ("No kernel for that bit level\n"), exit (1);
    strcpy(mystuff->stats.kernelname, kname);
  } else if (mystuff->exponent == 127) {
    char *kname;
    if (mystuff->bit_max_stage <= 183) kname = "mfaktc_barrett183_M127gs";
    else if (mystuff->bit_max_stage <= 185) kname = "mfaktc_barrett185_M127gs";
    else if (mystuff->bit_max_stage <= 188) kname = "mfaktc_barrett188_M127gs";
    else printf ("No kernel for that bit level\n"), exit (1);
    strcpy(mystuff->stats.kernelname, kname);
  } else
    sprintf(mystuff->stats.kernelname, "UNKNOWN kernel");

  if(mystuff->mode != MODE_SELFTEST_SHORT && mystuff->verbosity >= 1)printf("Using GPU kernel \"%s\"\n", mystuff->stats.kernelname);

  if(mystuff->mode == MODE_NORMAL)
  {
    if((mystuff->checkpoints == 1) && (checkpoint_read(mystuff->fermat_factoring, mystuff->exponent, mystuff->bit_min, mystuff->bit_max_stage, mystuff->k_lower_bound, mystuff->k_upper_bound, mystuff->dont_checksum, &cur_class, &factorsfound) == 1))
    {
      printf("\nfound a valid checkpoint file!\n");
      if(mystuff->verbosity >= 1)printf("  last finished class was: %d\n", cur_class);
      if(mystuff->verbosity >= 1)printf("  found %d factor(s) already\n\n", factorsfound);
      else                          printf("\n");
      cur_class++; // the checkpoint contains the last complete processed class!

/* calculate the number of classes which are already processed. This value is needed to estimate ETA */
      for(i = 0; i < cur_class; i++)
      {
        if(class_needed(mystuff->fermat_factoring, mystuff->exponent, k_min, i))mystuff->stats.class_counter++;
      }
      restart = mystuff->stats.class_counter;
    }
    else
    {
      cur_class=0;
    }
  }

  for(; cur_class <= max_class; cur_class++)
  {
    if(class_needed(mystuff->fermat_factoring, mystuff->exponent, k_min, cur_class))
    {
      mystuff->stats.class_number = cur_class;
      if(mystuff->quit)
      {
/* check if quit is requested. Because this is at the begining of the class
   we can be sure that if RET_QUIT is returned the last class hasn't
   finished. The signal handler which sets mystuff->quit not active during
   selftests so we need to check for RET_QUIT only when doing real work. */
        if(mystuff->printmode == 1)printf("\n");
        return RET_QUIT;
      }
      else
      {
        mystuff->stats.class_counter++;
      
	if(kernel == BARRETT92_MUL32_GS) numfactors = tf_class_barrett92_gs(k_min+cur_class, k_max, mystuff);
        else
        {
          printf("ERROR: Unknown kernel selected (%d)!\n", kernel);
          return RET_CUDA_ERROR;
        }
        cudaError = cudaGetLastError();
        if(cudaError != cudaSuccess)
        {
          printf("ERROR: cudaGetLastError() returned %d: %s\n", cudaError, cudaGetErrorString(cudaError));
          return RET_CUDA_ERROR; /* bail out, we might have a serios problem (detected by cudaGetLastError())... */
	}

	if (mystuff->verbosity == 88 && cur_class > 10) break;

        factorsfound += numfactors;
        if(mystuff->mode == MODE_NORMAL)
        {
          if(mystuff->checkpoints == 1)
          {
            if(numfactors > 0 || timer_diff(&timer_last_checkpoint)/1000000 > (unsigned long long int)mystuff->checkpointdelay || mystuff->quit)
            {
              timer_init(&timer_last_checkpoint);
              checkpoint_write(mystuff->fermat_factoring, mystuff->exponent, mystuff->bit_min, mystuff->bit_max_stage, mystuff->k_lower_bound, mystuff->k_upper_bound, cur_class, factorsfound);
            }
          }
          if((mystuff->stopafterfactor >= 2) && (factorsfound > 0) && (cur_class != max_class))cur_class = max_class + 1;
        }
      }
      fflush(NULL);
    }
  }
  if(mystuff->mode != MODE_SELFTEST_SHORT && mystuff->printmode == 1)printf("\n");
  print_result_line(mystuff, factorsfound);

  if(mystuff->mode == MODE_NORMAL)
  {
    retval = factorsfound;
    if(mystuff->checkpoints == 1)checkpoint_delete(mystuff->fermat_factoring, mystuff->exponent);
  }

  if(mystuff->mode != MODE_SELFTEST_SHORT)
  {
    time_run = timer_diff(&timer)/1000;

    if(restart == 0)printf("tf(): total time spent: ");
    else            printf("tf(): time spent since restart:   ");

/*  restart == 0 ==> time_est = time_run */
#ifndef MORE_CLASSES
    time_est = (time_run * 96ULL  ) / (unsigned long long int)(96 -restart);
#else
    time_est = (time_run * 960ULL ) / (unsigned long long int)(960-restart);
#endif

    if(time_est > 86400000ULL)printf("%" PRIu64 "d ",   time_run / 86400000ULL);
    if(time_est > 3600000ULL) printf("%2" PRIu64 "h ", (time_run /  3600000ULL) % 24ULL);
    if(time_est > 60000ULL)   printf("%2" PRIu64 "m ", (time_run /    60000ULL) % 60ULL);
                              printf("%2" PRIu64 ".%03" PRIu64 "s\n", (time_run / 1000ULL) % 60ULL, time_run % 1000ULL);
    if(restart != 0)
    {
      printf("      estimated total time spent: ");
      if(time_est > 86400000ULL)printf("%" PRIu64 "d ",   time_est / 86400000ULL);
      if(time_est > 3600000ULL) printf("%2" PRIu64 "h ", (time_est /  3600000ULL) % 24ULL);
      if(time_est > 60000ULL)   printf("%2" PRIu64 "m ", (time_est /    60000ULL) % 60ULL);
                                printf("%2" PRIu64 ".%03" PRIu64 "s\n", (time_est / 1000ULL) % 60ULL, time_est % 1000ULL);
    }
    printf("\n");
  }
  return retval;
}


void print_last_CUDA_error()
/* queries cudaGetLastError() once and, unless it reports cudaSuccess, prints
   the numeric error code together with the text from cudaGetErrorString() */
{
  const cudaError_t status = cudaGetLastError();

  if(status == cudaSuccess)return;

  printf("  cudaGetLastError() returned %d: %s\n", status, cudaGetErrorString(status));
}


int main(int argc, char **argv)
{
  int parse_ret = -1;
  int devicenumber = 0;
  mystuff_t mystuff;
  struct cudaDeviceProp deviceinfo;
  int i, tmp = 0;
  char *ptr;
  
  i = 1;
  mystuff.mode = MODE_NORMAL;
  mystuff.quit = 0;
  mystuff.verbosity = 1;
  mystuff.bit_min = -1;
  mystuff.bit_max_assignment = -1;
  mystuff.bit_max_stage = -1;
  mystuff.dont_checksum = 0;
  mystuff.fermat_factoring = 0;
  mystuff.gpu_sieving = 1;
  mystuff.gpu_sieve_size = 128 * 1024 * 1024;	/* Size (in bits) of the GPU sieve.  Default is 128M bits. */
  mystuff.gpu_sieve_primes = 82486;		/* Default to sieving primes below about 1.05M */
  mystuff.gpu_sieve_processing_size = 2048;	/* Default to 2K bits processed by each thread in a Barrett kernel. */
  mystuff.gpu_sieve_primes_auto = 1;		/* Default to different sieve primes for different kernels */
  sprintf(mystuff.resultfile, "results.txt");

  // Clear pointers for cudaFree's benefit in case these are never allocated
  mystuff.d_RES = NULL;
  mystuff.h_RES = NULL;
  mystuff.d_bitarray = NULL;
  mystuff.d_sieve_info = NULL;
  mystuff.d_calc_bit_to_clear_info = NULL;

  while(i < argc)
  {
    if(!strcmp((char*)"-h", argv[i]))
    {
      print_help(argv[0]);
      return 0;
    }
    else if(!strcmp((char*)"-d", argv[i]))
    {
      if(i+1 >= argc)
      {
        printf("ERROR: no device number specified for option \"-d\"\n");
        return 1;
      }
      devicenumber = (int)strtol(argv[i+1], &ptr, 10);
      if(*ptr || errno || devicenumber != strtol(argv[i+1], &ptr, 10) )
      {
        printf("ERROR: can't parse <device number> for option \"-d\"\n");
        return 1;
      }
      i++;
    }
    else if(!strcmp((char*)"-nocheck", argv[i]))
    {
      mystuff.dont_checksum = 1;
    }
    else if(!strcmp((char*)"-gs", argv[i]))
    {
      mystuff.gpu_sieving = 1;
    }
//    else if(!strcmp((char*)"-st", argv[i]))
//    {
//      mystuff.mode = MODE_SELFTEST_FULL;
//      mystuff.selftestsize = 1;
//    }
//    else if(!strcmp((char*)"-st2", argv[i]))
//    {
//      mystuff.mode = MODE_SELFTEST_FULL;
//      mystuff.selftestsize = 2;
//    }
    else if(!strcmp((char*)"--timertest", argv[i]))
    {
      timertest();
      return 0;
    }
//    else if(!strcmp((char*)"--sleeptest", argv[i]))
//    {
//      sleeptest();
//      return 0;
//    }
    else if(!strcmp((char*)"-v", argv[i]))
    {
      if(i+1 >= argc)
      {
        printf("ERROR: no verbosity level specified for option \"-v\"\n");
        return 1;
      }
      tmp = (int)strtol(argv[i+1], &ptr, 10);
      if(*ptr || errno || tmp != strtol(argv[i+1], &ptr, 10) )
      {
        printf("ERROR: can't parse verbosity level for option \"-v\"\n");
        return 1;
      }
      i++;
      
      if(tmp > 3 && tmp != 88)
      {
        printf("WARNING: maximum verbosity level is 3\n");
        tmp = 3;
      }
      
      if(tmp < 0)
      {
        printf("WARNING: minumum verbosity level is 0\n");
        tmp = 0;
      }

      mystuff.verbosity = tmp;
    }
    else
    {
      printf("ERROR: argument \"%s\" is unrecognized.\n", argv[i]);
      exit (1);
    }
    i++;
  }

  printf("mmff v%s (%dbit built)\n\n", MMFF_VERSION, (int)(sizeof(void*)*8));

/* print current configuration */
  
  if(mystuff.verbosity >= 1)printf("Compiletime options\n");
  if(mystuff.verbosity >= 1)printf("  THREADS_PER_BLOCK         %d\n", THREADS_PER_BLOCK);
//  if(mystuff.verbosity >= 1)printf("  SIEVE_SIZE_LIMIT          %dkiB\n", SIEVE_SIZE_LIMIT);
//  if(mystuff.verbosity >= 1)printf("  SIEVE_SIZE                %dbits\n", SIEVE_SIZE);
//  if(SIEVE_SIZE <= 0)
//  {
//    printf("ERROR: SIEVE_SIZE is <= 0, consider to increase SIEVE_SIZE_LIMIT in params.h\n");
//    return 1;
//  }
//  if(mystuff.verbosity >= 1)printf("  SIEVE_SPLIT               %d\n", SIEVE_SPLIT);
//  if(SIEVE_SPLIT > SIEVE_PRIMES_MIN)
//  {
//    printf("ERROR: SIEVE_SPLIT must be <= SIEVE_PRIMES_MIN\n");
//    return 1;
//  }
#ifdef MORE_CLASSES
  if(mystuff.verbosity >= 1)printf("  MORE_CLASSES              enabled\n");
#else
  if(mystuff.verbosity >= 1)printf("  MORE_CLASSES              disabled\n");
#endif

#ifdef RAW_GPU_BENCH
  if(mystuff.verbosity >= 1)printf("  RAW_GPU_BENCH             enabled (DEBUG option)\n");
#endif

  read_config(&mystuff);

  int drv_ver, rt_ver;
  if(mystuff.verbosity >= 1)printf("\nCUDA version info\n");
  if(mystuff.verbosity >= 1)printf("  binary compiled for CUDA  %d.%d\n", CUDART_VERSION/1000, CUDART_VERSION%100);
#if CUDART_VERSION >= 2020
  cudaRuntimeGetVersion(&rt_ver);
  if(mystuff.verbosity >= 1)printf("  CUDA runtime version      %d.%d\n", rt_ver/1000, rt_ver%100);
  cudaDriverGetVersion(&drv_ver);  
  if(mystuff.verbosity >= 1)printf("  CUDA driver version       %d.%d\n", drv_ver/1000, drv_ver%100);
  
  if(drv_ver < CUDART_VERSION)
  {
    printf("ERROR: current CUDA driver version is lower than the CUDA toolkit version used during compile!\n");
    printf("       Please update your graphics driver.\n");
    return 1;
  }
  if(rt_ver != CUDART_VERSION)
  {
    printf("ERROR: CUDA runtime version must match the CUDA toolkit version used during compile!\n");
    return 1;
  }
#endif  

  if(cudaSetDevice(devicenumber)!=cudaSuccess)
  {
    printf("cudaSetDevice(%d) failed\n",devicenumber);
    print_last_CUDA_error();
    return 1;
  }

  cudaGetDeviceProperties(&deviceinfo, devicenumber);
  if(mystuff.verbosity >= 1)printf("\nCUDA device info\n");
  if(mystuff.verbosity >= 1)printf("  name                      %s\n",deviceinfo.name);
  mystuff.compcapa_major = deviceinfo.major;
  mystuff.compcapa_minor = deviceinfo.minor;
  if(mystuff.verbosity >= 1)printf("  compute capability        %d.%d\n",deviceinfo.major,deviceinfo.minor);
  if(mystuff.compcapa_major == 1)
  {
    printf("Sorry, devices with compute capability 1.x are not supported!\n");
    return 1;
  }
  if(mystuff.verbosity >= 1)printf("  maximum threads per block %d\n",deviceinfo.maxThreadsPerBlock);
#if CUDART_VERSION >= 2000
  i=0;
       if(deviceinfo.major == 1)i=8;                            /* devices with compute capability 1.x have 8 shader cores per multiprocessor */
  else if(deviceinfo.major == 2 && deviceinfo.minor == 0)i=32;	/* devices with compute capability 2.0 have 32 shader cores per multiprocessor */
  else if(deviceinfo.major == 2 && deviceinfo.minor == 1)i=48;	/* devices with compute capability 2.1 have 48 shader cores per multiprocessor */
  else if(deviceinfo.major == 3 && deviceinfo.minor == 0)i=192;	/* devices with compute capability 3.0 have 192 shader cores per multiprocessor */
  if(i != 0){if(mystuff.verbosity >= 1)printf("  number of multiprocessors %d (%d shader cores)\n", deviceinfo.multiProcessorCount, deviceinfo.multiProcessorCount * i);}
  else      {if(mystuff.verbosity >= 1)printf("  number of mutliprocessors %d (unknown number of shader cores)\n", deviceinfo.multiProcessorCount);}
#endif
  if(mystuff.verbosity >= 1)printf("  clock rate                %dMHz\n", deviceinfo.clockRate / 1000);
  if(THREADS_PER_BLOCK > deviceinfo.maxThreadsPerBlock)
  {
    printf("\nERROR: THREADS_PER_BLOCK > deviceinfo.maxThreadsPerBlock\n");
    return 1;
  }

  // Don't do a CPU spin loop waiting for the GPU
  cudaSetDeviceFlags(cudaDeviceBlockingSync);

//  if(mystuff.verbosity >= 1)printf("\nAutomatic parameters\n");
//#if CUDART_VERSION >= 2000
//  i = THREADS_PER_BLOCK * deviceinfo.multiProcessorCount;
//  while( (i * 2) <= mystuff.threads_per_grid_max) i = i * 2;
//  mystuff.threads_per_grid = i;
//#else
//  mystuff.threads_per_grid = mystuff.threads_per_grid_max;
//#endif
//  if(mystuff.verbosity >= 1)printf("  threads per grid          %d\n", mystuff.threads_per_grid);
  
//  if(mystuff.threads_per_grid % THREADS_PER_BLOCK)
//  {
//    printf("ERROR: mystuff.threads_per_grid is _NOT_ a multiple of THREADS_PER_BLOCK\n");
//    return 1;
//  }
  if(mystuff.verbosity >= 1)printf("\n");
  
//  for(i=0;i<mystuff.num_streams;i++)
//  {
//    if( cudaStreamCreate(&(mystuff.stream[i])) != cudaSuccess)
//    {
//      printf("ERROR: cudaStreamCreate() failed for stream %d\n", i);
//      print_last_CUDA_error();
//      return 1;
//    }
//  }
/* Allocate some memory arrays */  
//  for(i=0;i<(mystuff.num_streams + mystuff.cpu_streams);i++)
//  {
//    if( cudaHostAlloc((void**)&(mystuff.h_ktab[i]), mystuff.threads_per_grid * sizeof(int), 0) != cudaSuccess )
//    {
//      printf("ERROR: cudaHostAlloc(h_ktab[%d]) failed\n", i);
//      print_last_CUDA_error();
//      return 1;
//    }
//  }
//  for(i=0;i<mystuff.num_streams;i++)
//  {
//    if( cudaMalloc((void**)&(mystuff.d_ktab[i]), mystuff.threads_per_grid * sizeof(int)) != cudaSuccess )
//    {
//      printf("ERROR: cudaMalloc(d_ktab1[%d]) failed\n", i);
//      print_last_CUDA_error();
//      return 1;
//    }
//  }
  if( cudaHostAlloc((void**)&(mystuff.h_RES), RESULTS_ARRAY_SIZE * sizeof(int), 0) != cudaSuccess )
  {
    printf("ERROR: cudaHostAlloc(h_RES) failed\n");
    print_last_CUDA_error();
    return 1;
  }
  if( cudaMalloc((void**)&(mystuff.d_RES), RESULTS_ARRAY_SIZE * sizeof(int)) != cudaSuccess )
  {
    printf("ERROR: cudaMalloc(d_RES) failed\n");
    print_last_CUDA_error();
    return 1;
  }

//  mystuff.sieve_primes_upper_limit = mystuff.sieve_primes_max;
  if(mystuff.mode == MODE_NORMAL)
  {

/* before we start real work run a small selftest */
//    mystuff.mode = MODE_SELFTEST_SHORT;
//BUG  We have no selftest data for double Mersenne and Fermat numbers
//BUG    printf("running a simple selftest...\n");
//BUG    if(selftest(&mystuff, 1) != 0)return 1; /* selftest failed :( */
    mystuff.mode = MODE_NORMAL;
    
/* register the signal handler before entering the assignment loop */
    register_signal_handler(&mystuff);
    
    do
    {
      parse_ret = get_next_assignment(mystuff.workfile, &mystuff.fermat_factoring, &mystuff.exponent, &mystuff.bit_min, &mystuff.bit_max_assignment, &mystuff.k_lower_bound, &mystuff.k_upper_bound, NULL, mystuff.verbosity);

      if (mystuff.fermat_factoring)
        sprintf(mystuff.exponent_string, "k*2^%u+1", mystuff.exponent);
      else
        sprintf(mystuff.exponent_string, "MM%u", mystuff.exponent);

      if(parse_ret == OK)
      {
        if(mystuff.verbosity >= 1) {
  	  if(mystuff.fermat_factoring || mystuff.exponent <= 127)
	  {
	    unsigned long long k_min, k_max;
	    k_min=calculate_k_min(mystuff.fermat_factoring, mystuff.exponent, mystuff.bit_min);
	    k_max=calculate_k_max(mystuff.fermat_factoring, mystuff.exponent, mystuff.bit_max_assignment);
	    if (mystuff.k_lower_bound > k_min) k_min = mystuff.k_lower_bound;
	    if (mystuff.k_upper_bound && mystuff.k_upper_bound < k_max) k_max = mystuff.k_upper_bound;
	    if (mystuff.bit_min+1 == mystuff.bit_max_assignment)
	      printf("got assignment: %s, k range %" PRIu64 " to %" PRIu64 " (%d-bit factors)\n", mystuff.exponent_string, k_min, k_max, mystuff.bit_min+1);
	    else
	      printf("got assignment: %s, k range %" PRIu64 " to %" PRIu64 " (%d to %d bit factors)\n", mystuff.exponent_string, k_min, k_max, mystuff.bit_min+1, mystuff.bit_max_assignment);
	  }
//	  else
//	    printf("got assignment: %s bit_min=%d bit_max=%d (%.2f GHz-days)\n", mystuff.exponent_string, mystuff.bit_min, mystuff.bit_max_assignment, primenet_ghzdays(mystuff.exponent, mystuff.bit_min, mystuff.bit_max_assignment));
	}

	mystuff.bit_max_stage = mystuff.bit_max_assignment;

        if(mystuff.stages == 1)
        {
//          while( ((calculate_k(mystuff.fermat_factoring, mystuff.exponent, mystuff.bit_max_stage) - calculate_k(mystuff.fermat_factoring, mystuff.exponent, mystuff.bit_min)) > (250000000ULL * NUM_CLASSES)) && ((mystuff.bit_max_stage - mystuff.bit_min) > 1) )mystuff.bit_max_stage--;
          while( mystuff.bit_max_stage - mystuff.bit_min > 1 )mystuff.bit_max_stage--;
        }
        tmp = 0;
        while(mystuff.bit_max_stage <= mystuff.bit_max_assignment && !mystuff.quit)
        {
          tmp = tf(&mystuff, AUTOSELECT_KERNEL);
          if(tmp == RET_CUDA_ERROR) return 1; /* bail out, we might have a serious problem (detected by cudaGetLastError())... */

          if(tmp != RET_QUIT)
          {
            if( (mystuff.stopafterfactor > 0) && (tmp > 0) )
            {
              mystuff.bit_max_stage = mystuff.bit_max_assignment;
            }

            {
              if(mystuff.bit_max_stage == mystuff.bit_max_assignment)parse_ret = clear_assignment(mystuff.workfile, mystuff.fermat_factoring, mystuff.exponent, mystuff.bit_min, mystuff.bit_max_assignment, mystuff.k_lower_bound, mystuff.k_upper_bound, 0);
              else                                                   parse_ret = clear_assignment(mystuff.workfile, mystuff.fermat_factoring, mystuff.exponent, mystuff.bit_min, mystuff.bit_max_assignment, mystuff.k_lower_bound, mystuff.k_upper_bound, mystuff.bit_max_stage);

                   if(parse_ret == CANT_OPEN_WORKFILE)   printf("ERROR: clear_assignment() / modify_assignment(): can't open \"%s\"\n", mystuff.workfile);
              else if(parse_ret == CANT_OPEN_TEMPFILE)   printf("ERROR: clear_assignment() / modify_assignment(): can't open \"__worktodo__.tmp\"\n");
              else if(parse_ret == ASSIGNMENT_NOT_FOUND) printf("ERROR: clear_assignment() / modify_assignment(): assignment not found in \"%s\"\n", mystuff.workfile);
              else if(parse_ret == CANT_RENAME)          printf("ERROR: clear_assignment() / modify_assignment(): can't rename workfiles\n");
              else if(parse_ret != OK)                   printf("ERROR: clear_assignment() / modify_assignment(): Unknown error (%d)\n", parse_ret);
            }

            mystuff.bit_min = mystuff.bit_max_stage;
            mystuff.bit_max_stage++;
          }
        }
      }
      else if(parse_ret == CANT_OPEN_FILE)             printf("ERROR: get_next_assignment(): can't open \"%s\"\n", mystuff.workfile);
      else if(parse_ret == VALID_ASSIGNMENT_NOT_FOUND) printf("ERROR: get_next_assignment(): no valid assignment found in \"%s\"\n", mystuff.workfile);
      else if(parse_ret != OK)                         printf("ERROR: get_next_assignment(): Unknown error (%d)\n", parse_ret);
    }
    while(parse_ret == OK && !mystuff.quit);
  }

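  // Free CUDA results buffers.
  // NOTE(review): h_RES is allocated with cudaHostAlloc() above; the matching
  // release call is cudaFreeHost(), not cudaFree() -- confirm and fix.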
  cudaFree(mystuff.d_RES);
  cudaFree(mystuff.h_RES);

  // Free GPU sieve data structures
  cudaFree(mystuff.d_bitarray);
  cudaFree(mystuff.d_sieve_info);
  cudaFree(mystuff.d_calc_bit_to_clear_info);

  return 0;
}
