/* Optimized Conway's Game of Life
 *
 * Board:
 * 	- 10,000 X 10,000 cells
 * 	- stored as 2,500 X 5,000 array of blocks
 * Cell Block:
 * 	- 4 X 2 cell area of the board stored in a "unsigned char"
 * 	- x=0, y=0 stored in LSB, x=3, y=1 stored in MSB
 * Cell Area:
 * 	- 6 x 4 cell area of the board stored as "unsigned char"
 * 	- represent 1 block and 8 neighboring blocks
 * 	- the strange bit arrangment below is to get the most commonly
 * 	  used indexes into as few cache lines as possible:
 * 	     00:   4   of block x+1,y-1
 * 	     01:   3   of block x-1,y
 * 	     02:   4   of block x+1,y
 * 	     03:   7   of block x-1,y-1
 *
 * 	     03:   0   of block x+1,y
 * 	     05:   7   of block x-1,y
 * 	     06:   0   of block x+1,y+1
 * 	     07:   3   of block x-1,y+1
 *        08-11:   4-7 of block x  ,y-1
 *        12-19:   0-7 of block x, y
 *        20-23:   0-3 of block x  ,y+1
 * Lookup Table:
 * 	- array of blocks (4X2 cells), indexed by area (24 bits)
 */

#include <stdlib.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <errno.h>
#include <sys/time.h>
#include <unistd.h>


//#define GATHER_STATS 1
//#define DEBUG_LEVEL 1  //Print terse debug info
#define DEBUG_LEVEL 2  //Print verbose debug info


#define ABS_SIZE_X 10000
#define ABS_SIZE_Y 10000  // 8188/2+2 = 4096 

#if (SIZE_X%4) !=0
X size must be multiple of 4
#endif
#if (SIZE_Y%2) !=0
Y size must be multiple of 2
#endif

#define SIZE_X ABS_SIZE_X/4
#define SIZE_Y ABS_SIZE_Y/2
#define LOOKUP_TABLE_SIZE 16777216

/* Timing stuff */
#define TIMER_CLEAR	(tv1.tv_sec=tv1.tv_usec=tv2.tv_sec=tv2.tv_usec=0)
#define TIMER_START	gettimeofday(&tv1, (struct timezone*)0)
#define TIMER_STOP	gettimeofday(&tv2, (struct timezone*)0)
#define TIMER_ELAPSED   (tv2.tv_sec-tv1.tv_sec+(tv2.tv_usec-tv1.tv_usec)*1.E-6)
struct timeval tv1,tv2;


/* +1 to allow for right and +2 bottom edge */
unsigned char board[SIZE_X+1][SIZE_Y+2];

/* Lookup (read) cache */
unsigned char bcol1[SIZE_Y+4]; /* One extra at the top and three at the bottom */
unsigned char bcol2[SIZE_Y+4];
unsigned char bcol3[SIZE_Y+4];
/* Store (write) cache */
unsigned char ncol1[SIZE_Y];
unsigned char ncol2[SIZE_Y];
unsigned char ncol3[SIZE_Y];

unsigned int * lookup_stats;


/* 
 * Count number of neighboring living cells for a given cell
 */
int count_neighbors(int bit_array[6][4], int x, int y) {
    int adder,i,j;
  
    adder=0;

    for (i=x-1; i<=x+1; i++) {
	for (j=y-1; j<=y+1; j++) {
	    if ((i!=x) || (j!=y)) {
		adder=adder+bit_array[i][j];
	    }
	}
    }
    return(adder);
}

/* 
 * Translate the optimized area representation into 
 * the visual representational model 
 */
unsigned int translate_area(unsigned int opt_area)
{
    unsigned int input_area;
    input_area  = (opt_area  & 0x00f000) >> 5;  /* bits 7-10 */
    input_area |= (opt_area  & 0x0f0000) >> 3;  /* bits 11-16 */
    input_area |= (opt_area  & 0x000f00) >> 7;  /* bits 1-4 */
    input_area |= (opt_area  & 0xf00000) >> 1;  /* bits 19-22 */

    input_area |= (opt_area  & 0x000001) << 5; /* bit 5 */
    input_area |= (opt_area  & 0x000002) << 5; /* bit 6 */
    input_area |= (opt_area  & 0x000004) << 15;  /* bit 17 */
    input_area |= (opt_area  & 0x000008) >> 3; /* bit 0 */

    input_area |= (opt_area  & 0x000010) << 7;  /* bit 11 */
    input_area |= (opt_area  & 0x000020) << 7;  /* bit 12 */
    input_area |= (opt_area  & 0x000040) << 17;  /* bit 23 */
    input_area |= (opt_area  & 0x000080) << 11;  /* bit 18 */
    return(input_area);
}

/* 
 * Figure out the result of one life iteration for a life block.
 * The input block is 24 bits (unsigned int)
 * The output block is 8 bits (char)
 * This is only used for creating the lookup table initially.
 * This is not part of the optimized code path.
 */
char calc_block(unsigned int opt_area) 
{
    unsigned int input_area;
    int input_cell[6][4];
    
    char result_block;
    int result_cell[4][2];
    int x,y,i;

    input_area=translate_area(opt_area);

    /* Populate input_cell array from input_area*/
    for (x=0; x<6; x++) {
	for (y=0; y<4; y++) {
	    if ((input_area >> (y*6+x)) & 0x1) {
		input_cell[x][y] =1;
	    } else {
		input_cell[x][y] =0;
	    }
	}
    }

    for (x=1; x<5; x++) {
	for (y=1; y<3; y++) {
	    switch (count_neighbors(input_cell, x, y)) {
		case 3:
		    result_cell[x-1][y-1] =1;
		    break;
		case 2:
		    if (input_cell[x][y]) {
			result_cell[x-1][y-1] =1;
		    } else {
			result_cell[x-1][y-1] =0;
		    }
		    break;
		default:
		    result_cell[x-1][y-1]=0;
		    break;
	    }
	}
    }

    /* Create result_block from result_cell array */
    result_block = 0;
    for (x=0; x<4; x++) {
	for (y=0; y<2; y++) {
	    if (result_cell[x][y] == 1) {
		result_block |= (0x1 << y*4+x);
	    }
	}
    }

    return(result_block);
}


print_area(unsigned int opt_area)
{
    unsigned int input_area;
    int x,y,i;

    input_area  = translate_area(opt_area);

    /* Populate input_cell array from input_area*/
    for (y=0; y<4; y++) {
	for (x=0; x<6; x++) {
	    if ((input_area >> (y*6+x)) & 0x1) {
		printf("O");
	    } else {
		printf(".");
	    }
	}
	printf("\n");
    }
}

/*
 * Take a numeric block representation and print it:
 *  - 'O' for a living cell
 *  - '.' for an empty cell
 */
print_block(char block)
{
    int x, y;
    for (y=0; y<2; y++) {
	for (x=0; x<4; x++) {
	    if ((block >> (y*4+x)) & 0x1) {
		printf("O");
	    } else {
		printf(".");
	    }
	}
	printf("\n");
    }
}

/*
 * Print a rectangular region of the board
 *  - 'O' for a living cell
 *  - '.' for an empty cell
 */
print_board_rect(
    unsigned char grid[SIZE_X+1][SIZE_Y+2], 
    unsigned int startx, 
    unsigned int starty, 
    unsigned int width, 
    unsigned int height )
{
    unsigned int i,j,k,l,block;

    for (i=0; i<height; i++) {
	for (l=0; l<2; l++) {
	    for (j=0; j<width; j++) {
		block=grid[startx+j][starty+i];
		for (k=0; k<4; k++) {
		    if ((block >> (l*4+k)) & 0x1) {
			printf("O");
		    } else {
			printf(".");
		    }
		}
	    }
	    printf("\n");
	}
    }
}

/*
 * Create the lookup table array.
 *  - First try and mmap a pre-generated file.
 *  - If the file doesn't exist calculate the table in memory
 *    dump it to the file and re-mmap it.
 */
char * create_lookup_table(char * filename)
{
    unsigned int i;
    unsigned int j;
    unsigned int k;
    char * table;
    int fd;
    long int rsz;

    fd = open(filename, O_RDONLY, 0600);
    if (fd<1) {
	perror("File open failed on lookup table");
	close(fd);

	table = (char *)malloc(sizeof(char) * LOOKUP_TABLE_SIZE);
	if (! table ) {
	    return(table);
	}
	printf("Populating lookup:\n");
	for (i=0; i<LOOKUP_TABLE_SIZE; i++) {
	    if (i%(LOOKUP_TABLE_SIZE/10)==0) printf (".\n", i);
	    table[i] =calc_block(i);
	}

	/* Writing out lookup table */
	fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0600);
	write(fd, table, LOOKUP_TABLE_SIZE);
	close(fd);
	free(table);

	printf("Lookup table written -- trying mmap again\n");
	fd = open(filename, O_RDONLY, 0600);
	if (fd<1) {
	    perror("File open failed again on lookup table, giving up");
	    return 0;
	}
    }

    rsz = lseek(fd, 0, SEEK_END);
    if(rsz%4096) {
	rsz+=4096-(rsz%4096);
    }
    //table = mmap(NULL, rsz, PROT_READ, MAP_FILE|MAP_VARIABLE|MAP_SHARED,fd,0);
    table = mmap(NULL, rsz, PROT_READ, MAP_FILE|MAP_SHARED,fd,0);
    if (table==MAP_FAILED) {
	perror("Couldn't mmap");
	return(NULL);
    }
    close(fd);

    return(table);
}


/*
 * Randomly seed the board with 30% living cells (70% empty)
 */
populate_board()
{
    int x,y,bit;

    srand(91823);
    for (y=0; y<SIZE_Y; y++) {
	for (x=0; x<SIZE_X; x++) {
	    board[x][y] = 0;
	    for (bit=0; bit<8; bit++) {
		board[x][y] |= (rand() < (RAND_MAX/3) ? 1 : 0) << bit;
	    }
	}
    }
    /* Fill the right edge column */
    for (y=0; y<SIZE_Y+1; y++) {
	board[SIZE_X][y] = 0;
    }
    /* Fill the bottom edge row */
    for (x=0; x<SIZE_X+1; x++) {
	board[x][SIZE_Y] = 0;
    }
}


/*
 * The CALC_COL macro and the mate function are the only routines 
 * in the optimized code path.
 */

/*
 * "loop-unroll" CALC_COL() macro for mate() function 
 */
#ifdef GATHER_STATS
#define CALC_COL(NN1, NN2, NN3) 				\
	bcopy(board[x+1], &bcol##NN3[1], SIZE_Y); 		\
	colptr1 = (unsigned int *)bcol##NN1; 			\
	colptr2 = (unsigned int *)bcol##NN2; 			\
	colptr3 = (unsigned int *)bcol##NN3; 			\
	for (y=0; y<SIZE_Y; y++) { 				\
	    /* Calculate area value */ 				\
	    area2  = (*colptr1 & 0x088880) | 			\
		     (*colptr3 & 0x011110); 			\
	    area = 						\
		((area2 & 0x019800) >> 10) | 			\
		((area2 & 0x000190) >> 4) | 			\
		((area2 & 0x080000) >> 12) | 			\
		((*colptr2 & 0x0ffff0) << 4); 			\
	    ncol##NN1[y] = lookup[area]; 			\
	    lookup_stats[area]++; 				\
	    colptr1 = (unsigned int *) ((char *) colptr1 + 1); 	\
	    colptr2 = (unsigned int *) ((char *) colptr2 + 1); 	\
	    colptr3 = (unsigned int *) ((char *) colptr3 + 1); 	\
        }
#else
#define CALC_COL(NN1, NN2, NN3) 				\
	bcopy(board[x+1], &bcol##NN3[1], SIZE_Y); 		\
	colptr1 = (unsigned int *)bcol##NN1; 			\
	colptr2 = (unsigned int *)bcol##NN2; 			\
	colptr3 = (unsigned int *)bcol##NN3; 			\
	for (y=0; y<SIZE_Y; y++) { 				\
	    /* Calculate area value */ 				\
	    area2  = (*colptr1 & 0x088880) | 			\
		     (*colptr3 & 0x011110); 			\
	    ncol##NN1[y] = lookup[ 				\
		((area2 & 0x019800) >> 10) | 			\
		((area2 & 0x000190) >> 4) | 			\
		((area2 & 0x080000) >> 12) | 			\
		((*colptr2 & 0x0ffff0) << 4)]; 			\
	    colptr1 = (unsigned int *) ((char *) colptr1 + 1); 	\
	    colptr2 = (unsigned int *) ((char *) colptr2 + 1); 	\
	    colptr3 = (unsigned int *) ((char *) colptr3 + 1); 	\
        }
#endif

/* 
 * Calculate a new generation for an entire board.
 */ 
mate(char * lookup)
{
    unsigned int x,y;
    unsigned int area, area2;
    //__unaligned register unsigned int *colptr1,*colptr2, *colptr3;
    register unsigned int *colptr1,*colptr2, *colptr3;

    /* Set the block values for outside edges */
    bzero(bcol1, SIZE_Y+4);
    bcopy(board[0], &bcol2[1], SIZE_Y);
    bcol2[0] =0;
    bcol2[SIZE_Y+1] =0;
    bcol2[SIZE_Y+2] =0;
    bcol2[SIZE_Y+3] =0;
    bcol3[0] =0;
    bcol3[SIZE_Y+1] =0;
    bcol3[SIZE_Y+2] =0;
    bcol3[SIZE_Y+3] =0;
    for (x=0; x<SIZE_X; x++) {

	CALC_COL(1,2,3);

	x++;
	if (x>=(SIZE_X)) {
	    bcopy(ncol1, board[x-1], SIZE_Y);
	    break;
	}

	CALC_COL(2,3,1);

	x++;
	if (x>=(SIZE_X)) {
	    bcopy(ncol1, board[x-2], SIZE_Y);
	    bcopy(ncol2, board[x-1], SIZE_Y);
	    break;
	}

	CALC_COL(3,1,2);

	bcopy(ncol1, board[x-2], SIZE_Y);
	bcopy(ncol2, board[x-1], SIZE_Y);
	bcopy(ncol3, board[x], SIZE_Y);
    }
}

#undef CALC_COL


/*
 * main:
 *  - mmap the lookup table or generate it
 *  - Randomly populate the board
 *  - Iterate through board generations
 *  - Print timing information
 *
 *  - Gather stats and print debug info as requested
 */
int main(int argc, char *argv[]) {
    unsigned int i,j;
    char * lookup;
    float total_time=0.0;
    float min_time=10000.0;
    float max_time=0.0;

    lookup = create_lookup_table("lookup.tbl");
    if (! lookup ) {
	printf ("Couldn't load lookup table. Exiting.\n");
	return -1;
    }

#ifdef GATHER_STATS
    lookup_stats = (unsigned int *)malloc(sizeof(unsigned int) * LOOKUP_TABLE_SIZE);
    for (i=0; i<LOOKUP_TABLE_SIZE; i++) {
	lookup_stats[i] =0;
    }
#endif

    printf("Populating board:\n");
    TIMER_CLEAR;
    TIMER_START;
    populate_board();
    TIMER_STOP;
    printf("Populating TIME: %lf seconds\n",TIMER_ELAPSED);

#ifdef DEBUG_LEVEL    
    print_board_rect(board, SIZE_X-15, SIZE_Y-10, 15, 10);
    printf("\n");
#endif
    printf("Processing board:\n");
    for (i=0; i<30; i++) {
	TIMER_CLEAR;
	TIMER_START;
	mate(lookup);
	TIMER_STOP;
	total_time += TIMER_ELAPSED;
	if (TIMER_ELAPSED > max_time)
	    max_time = TIMER_ELAPSED;
	if (TIMER_ELAPSED < min_time)
	    min_time = TIMER_ELAPSED;
#if DEBUG_LEVEL==2
	print_board_rect(board, SIZE_X-30, SIZE_Y-30, 30, 30);
#endif
	printf("Generation %d took %lf seconds\n",i+1, TIMER_ELAPSED);

    }
    printf("Finished processing board\n");
#ifdef DEBUG_LEVEL
    print_board_rect(board, SIZE_X-30, SIZE_Y-30, 30, 30);
    printf("\n");
#endif

    printf("All generations time:    %10lf seconds (%d generations)\n",
	   total_time,i);
    printf("Max generation time:     %10lf seconds\n",
	   max_time);
    printf("Min generation time:     %10lf seconds\n",
	   min_time);
    printf("Average generation time: %10lf seconds\n",
	   total_time/i);

#ifdef GATHER_STATS
    {
	int fd;
	for (i=0; i<20; i++) {
	    printf("lookup_stats[%d] = %ld\n", i, lookup_stats[i]);
	}
	fd = open("lookup_stats", O_CREAT | O_WRONLY, 0600);
	if (fd<1) {
	    perror("File open failed on lookup table");
	} else {
	    write(fd, lookup_stats, LOOKUP_TABLE_SIZE*4);
	}
	close(fd);
	free(lookup_stats);
    }
#endif

    munmap(lookup, LOOKUP_TABLE_SIZE);
}

