/* Optimized Conway's Game of Life
 *
 * Board:
 *      - 10,000 X 10,000 cells
 *      - stored as 2,500 X 5,000 array of blocks
 * Cell Block:
 *      - 4 X 2 cell area of the board stored in a "unsigned char"
 *      - x=0, y=0 stored in LSB, x=3, y=1 stored in MSB
 * Cell Area:
 *      - 6 x 4 cell area of the board stored as "unsigned char"
 *      - represent 1 block and 8 neighboring blocks
 *      - the strange bit arrangment below is to get the most commonly
 *        used indexes into as few cache lines as possible:
 *           00:   4   of block x+1,y-1
 *           01:   3   of block x-1,y
 *           02:   4   of block x+1,y
 *           03:   7   of block x-1,y-1
 *
 *           03:   0   of block x+1,y
 *           05:   7   of block x-1,y
 *           06:   0   of block x+1,y+1
 *           07:   3   of block x-1,y+1
 *        08-11:   4-7 of block x  ,y-1
 *        12-19:   0-7 of block x, y
 *        20-23:   0-3 of block x  ,y+1
 * Lookup Table:
 *      - array of blocks (4X2 cells), indexed by area (24 bits)
 */

#include <stdlib.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <errno.h>
#include <sys/time.h>
#include <unistd.h>


//#define GATHER_STATS 1
//#define DEBUG_LEVEL 1  //Print terse debug info
#define DEBUG_LEVEL 2  //Print verbose debug info


#define ABS_SIZE_X 10000
#define ABS_SIZE_Y 10000  // 8188/2+2 = 4096 

#if (SIZE_X%4) !=0
X size must be multiple of 4
#endif
#if (SIZE_Y%2) !=0
Y size must be multiple of 2
#endif

#define SIZE_X ABS_SIZE_X/4
#define SIZE_Y ABS_SIZE_Y/2
#define LOOKUP_TABLE_SIZE 16777216

/* Timing stuff */
#define TIMER_CLEAR     (tv1.tv_sec=tv1.tv_usec=tv2.tv_sec=tv2.tv_usec=0)
#define TIMER_START     gettimeofday(&tv1, (struct timezone*)0)
#define TIMER_STOP      gettimeofday(&tv2, (struct timezone*)0)
#define TIMER_ELAPSED   (tv2.tv_sec-tv1.tv_sec+(tv2.tv_usec-tv1.tv_usec)*1.E-6)
struct timeval tv1,tv2;


/* +1 to allow for right and +2 bottom edge */
unsigned char board[SIZE_X+1][SIZE_Y+2];

/* Lookup (read) cache */
unsigned char bcol1[SIZE_Y+4]; /* One extra at the top and three at the bottom */
unsigned char bcol2[SIZE_Y+4];
unsigned char bcol3[SIZE_Y+4];
/* Store (write) cache */
unsigned char ncol1[SIZE_Y];
unsigned char ncol2[SIZE_Y];
unsigned char ncol3[SIZE_Y];

unsigned int * lookup_stats;


/* 
 * Count number of neighboring living cells for a given cell
 */
int count_neighbors(int bit_array[6][4], int x, int y) {
    int adder,i,j;

    adder=0;

    for (i=x-1; i<=x+1; i++) {
        for (j=y-1; j<=y+1; j++) {
            if ((i!=x) || (j!=y)) {
                adder=adder+bit_array[i][j];
            }
        }
    }
    return(adder);
}

/* 
 * Translate the optimized area representation into 
 * the visual representational model 
 */
unsigned int translate_area(unsigned int opt_area)
{
    unsigned int input_area;
    input_area  = (opt_area  & 0x00f000) >> 5;  /* bits 7-10 */
    input_area |= (opt_area  & 0x0f0000) >> 3;  /* bits 11-16 */
    input_area |= (opt_area  & 0x000f00) >> 7;  /* bits 1-4 */
    input_area |= (opt_area  & 0xf00000) >> 1;  /* bits 19-22 */

    input_area |= (opt_area  & 0x000001) << 5; /* bit 5 */
    input_area |= (opt_area  & 0x000002) << 5; /* bit 6 */
    input_area |= (opt_area  & 0x000004) << 15;  /* bit 17 */
    input_area |= (opt_area  & 0x000008) >> 3; /* bit 0 */

    input_area |= (opt_area  & 0x000010) << 7;  /* bit 11 */
    input_area |= (opt_area  & 0x000020) << 7;  /* bit 12 */
    input_area |= (opt_area  & 0x000040) << 17;  /* bit 23 */
    input_area |= (opt_area  & 0x000080) << 11;  /* bit 18 */
    return(input_area);
}

/* 
 * Figure out the result of one life iteration for a life block.
 * The input block is 24 bits (unsigned int)
 * The output block is 8 bits (char)
 * This is only used for creating the lookup table initially.
 * This is not part of the optimized code path.
 */
char calc_block(unsigned int opt_area)
{
    unsigned int input_area;
    int input_cell[6][4];

    char result_block;
    int result_cell[4][2];
    int x,y,i;

    input_area=translate_area(opt_area);

    /* Populate input_cell array from input_area*/
    for (x=0; x<6; x++) {
        for (y=0; y<4; y++) {
            if ((input_area >> (y*6+x)) & 0x1) {
                input_cell[x][y] =1;
            } else {
                input_cell[x][y] =0;
            }
        }
    }

    for (x=1; x<5; x++) {
        for (y=1; y<3; y++) {
            switch (count_neighbors(input_cell, x, y)) {
                case 3:
                    result_cell[x-1][y-1] =1;
                    break;
                case 2:
                    if (input_cell[x][y]) {
                        result_cell[x-1][y-1] =1;
                    } else {
                        result_cell[x-1][y-1] =0;
                    }
                    break;
                default:
                    result_cell[x-1][y-1]=0;
                    break;
            }
        }
    }

    /* Create result_block from result_cell array */
    result_block = 0;
    for (x=0; x<4; x++) {
        for (y=0; y<2; y++) {
            if (result_cell[x][y] == 1) {
                result_block |= (0x1 << y*4+x);
            }
        }
    }

    return(result_block);
}


print_area(unsigned int opt_area)
{
    unsigned int input_area;
    int x,y,i;

    input_area  = translate_area(opt_area);

    /* Populate input_cell array from input_area*/
    for (y=0; y<4; y++) {
        for (x=0; x<6; x++) {
            if ((input_area >> (y*6+x)) & 0x1) {
                printf("O");
            } else {
                printf(".");
            }
        }
        printf("\n");
    }
}

/*
 * Take a numeric block representation and print it:
 *  - 'O' for a living cell
 *  - '.' for an empty cell
 */
print_block(char block)
{
    int x, y;
    for (y=0; y<2; y++) {
        for (x=0; x<4; x++) {
            if ((block >> (y*4+x)) & 0x1) {
                printf("O");
            } else {
                printf(".");
            }
        }
        printf("\n");
    }
}

/*
 * Print a rectangular region of the board
 *  - 'O' for a living cell
 *  - '.' for an empty cell
 */
print_board_rect(
    unsigned char grid[SIZE_X+1][SIZE_Y+2],
    unsigned int startx,
    unsigned int starty,
    unsigned int width,
    unsigned int height )
{
    unsigned int i,j,k,l,block;

    for (i=0; i<height; i++) {
        for (l=0; l<2; l++) {
            for (j=0; j<width; j++) {
                block=grid[startx+j][starty+i];
                for (k=0; k<4; k++) {
                    if ((block >> (l*4+k)) & 0x1) {
                        printf("O");
                    } else {
                        printf(".");
                    }
                }
            }
            printf("\n");
        }
    }
}

/*
 * Create the lookup table array.
 *  - First try and mmap a pre-generated file.
 *  - If the file doesn't exist calculate the table in memory
 *    dump it to the file and re-mmap it.
 */
char * create_lookup_table(char * filename)
{
    unsigned int i;
    unsigned int j;
    unsigned int k;
    char * table;
    int fd;
    long int rsz;

    fd = open(filename, O_RDONLY, 0600);
    if (fd<1) {
        perror("File open failed on lookup table");
        close(fd);

        table = (char *)malloc(sizeof(char) * LOOKUP_TABLE_SIZE);
        if (! table ) {
            return(table);
        }
        printf("Populating lookup:\n");
        for (i=0; i<LOOKUP_TABLE_SIZE; i++) {
            if (i%(LOOKUP_TABLE_SIZE/10)==0) printf (".\n", i);
            table[i] =calc_block(i);
        }

        /* Writing out lookup table */
        fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0600);
        write(fd, table, LOOKUP_TABLE_SIZE);
        close(fd);
        free(table);

        printf("Lookup table written -- trying mmap again\n");
        fd = open(filename, O_RDONLY, 0600);
        if (fd<1) {
            perror("File open failed again on lookup table, giving up");
            return 0;
        }
    }

    rsz = lseek(fd, 0, SEEK_END);
    if(rsz%4096) {
        rsz+=4096-(rsz%4096);
    }
    //table = mmap(NULL, rsz, PROT_READ, MAP_FILE|MAP_VARIABLE|MAP_SHARED,fd,0);
    table = mmap(NULL, rsz, PROT_READ, MAP_FILE|MAP_SHARED,fd,0);
    if (table==MAP_FAILED) {
        perror("Couldn't mmap");
        return(NULL);
    }
    close(fd);

    return(table);
}


/*
 * Randomly seed the board with 30% living cells (70% empty)
 */
populate_board()
{
    int x,y,bit;

    srand(91823);
    for (y=0; y<SIZE_Y; y++) {
        for (x=0; x<SIZE_X; x++) {
            board[x][y] = 0;
            for (bit=0; bit<8; bit++) {
                board[x][y] |= (rand() < (RAND_MAX/3) ? 1 : 0) << bit;
            }
        }
    }
    /* Fill the right edge column */
    for (y=0; y<SIZE_Y+1; y++) {
        board[SIZE_X][y] = 0;
    }
    /* Fill the bottom edge row */
    for (x=0; x<SIZE_X+1; x++) {
        board[x][SIZE_Y] = 0;
    }
}


/*
 * The CALC_COL macro and the mate function are the only routines 
 * in the optimized code path.
 */

/*
 * "loop-unroll" CALC_COL() macro for mate() function 
 */
#ifdef GATHER_STATS
#define CALC_COL(NN1, NN2, NN3)                                 \
        bcopy(board[x+1], &bcol##NN3[1], SIZE_Y);               \
        colptr1 = (unsigned int *)bcol##NN1;                    \
        colptr2 = (unsigned int *)bcol##NN2;                    \
        colptr3 = (unsigned int *)bcol##NN3;                    \
        for (y=0; y<SIZE_Y; y++) {                              \
            /* Calculate area value */                          \
            area2  = (*colptr1 & 0x088880) |                    \
                     (*colptr3 & 0x011110);                     \
            area =                                              \
                ((area2 & 0x019800) >> 10) |                    \
                ((area2 & 0x000190) >> 4) |                     \
                ((area2 & 0x080000) >> 12) |                    \
                ((*colptr2 & 0x0ffff0) << 4);                   \
            ncol##NN1[y] = lookup[area];                        \
            lookup_stats[area]++;                               \
            colptr1 = (unsigned int *) ((char *) colptr1 + 1);  \
            colptr2 = (unsigned int *) ((char *) colptr2 + 1);  \
            colptr3 = (unsigned int *) ((char *) colptr3 + 1);  \
        }
#else
#define CALC_COL(NN1, NN2, NN3)                                 \
        bcopy(board[x+1], &bcol##NN3[1], SIZE_Y);               \
        colptr1 = (unsigned int *)bcol##NN1;                    \
        colptr2 = (unsigned int *)bcol##NN2;                    \
        colptr3 = (unsigned int *)bcol##NN3;                    \
        for (y=0; y<SIZE_Y; y++) {                              \
            /* Calculate area value */                          \
            area2  = (*colptr1 & 0x088880) |                    \
                     (*colptr3 & 0x011110);                     \
            ncol##NN1[y] = lookup[                              \
                ((area2 & 0x019800) >> 10) |                    \
                ((area2 & 0x000190) >> 4) |                     \
                ((area2 & 0x080000) >> 12) |                    \
                ((*colptr2 & 0x0ffff0) << 4)];                  \
            colptr1 = (unsigned int *) ((char *) colptr1 + 1);  \
            colptr2 = (unsigned int *) ((char *) colptr2 + 1);  \
            colptr3 = (unsigned int *) ((char *) colptr3 + 1);  \
        }
#endif

/* 
 * Calculate a new generation for an entire board.
 */
mate(char * lookup)
{
    unsigned int x,y;
    unsigned int area, area2;
    //__unaligned register unsigned int *colptr1,*colptr2, *colptr3;
    register unsigned int *colptr1,*colptr2, *colptr3;

    /* Set the block values for outside edges */
    bzero(bcol1, SIZE_Y+4);
    bcopy(board[0], &bcol2[1], SIZE_Y);
    bcol2[0] =0;
    bcol2[SIZE_Y+1] =0;
    bcol2[SIZE_Y+2] =0;
    bcol2[SIZE_Y+3] =0;
    bcol3[0] =0;
    bcol3[SIZE_Y+1] =0;
    bcol3[SIZE_Y+2] =0;
    bcol3[SIZE_Y+3] =0;
    for (x=0; x<SIZE_X; x++) {

        CALC_COL(1,2,3);

        x++;
        if (x>=(SIZE_X)) {
            bcopy(ncol1, board[x-1], SIZE_Y);
            break;
        }

        CALC_COL(2,3,1);

        x++;
        if (x>=(SIZE_X)) {
            bcopy(ncol1, board[x-2], SIZE_Y);
            bcopy(ncol2, board[x-1], SIZE_Y);
            break;
        }

        CALC_COL(3,1,2);

        bcopy(ncol1, board[x-2], SIZE_Y);
        bcopy(ncol2, board[x-1], SIZE_Y);
        bcopy(ncol3, board[x], SIZE_Y);
    }
}

#undef CALC_COL


/*
 * main:
 *  - mmap the lookup table or generate it
 *  - Randomly populate the board
 *  - Iterate through board generations
 *  - Print timing information
 *
 *  - Gather stats and print debug info as requested
 */
int main(int argc, char *argv[]) {
    unsigned int i,j;
    char * lookup;
    float total_time=0.0;
    float min_time=10000.0;
    float max_time=0.0;

    lookup = create_lookup_table("lookup.tbl");
    if (! lookup ) {
        printf ("Couldn't load lookup table. Exiting.\n");
        return -1;
    }

#ifdef GATHER_STATS
    lookup_stats = (unsigned int *)malloc(sizeof(unsigned int) * LOOKUP_TABLE_SIZE);
    for (i=0; i<LOOKUP_TABLE_SIZE; i++) {
        lookup_stats[i] =0;
    }
#endif

    printf("Populating board:\n");
    TIMER_CLEAR;
    TIMER_START;
    populate_board();
    TIMER_STOP;
    printf("Populating TIME: %lf seconds\n",TIMER_ELAPSED);

#ifdef DEBUG_LEVEL    
    print_board_rect(board, SIZE_X-15, SIZE_Y-10, 15, 10);
    printf("\n");
#endif
    printf("Processing board:\n");
    for (i=0; i<30; i++) {
        TIMER_CLEAR;
        TIMER_START;
        mate(lookup);
        TIMER_STOP;
        total_time += TIMER_ELAPSED;
        if (TIMER_ELAPSED > max_time)
            max_time = TIMER_ELAPSED;
        if (TIMER_ELAPSED < min_time)
            min_time = TIMER_ELAPSED;
#if DEBUG_LEVEL==2
        print_board_rect(board, SIZE_X-30, SIZE_Y-30, 30, 30);
#endif
        printf("Generation %d took %lf seconds\n",i+1, TIMER_ELAPSED);

    }
    printf("Finished processing board\n");
#ifdef DEBUG_LEVEL
    print_board_rect(board, SIZE_X-30, SIZE_Y-30, 30, 30);
    printf("\n");
#endif

    printf("All generations time:    %10lf seconds (%d generations)\n",
           total_time,i);
    printf("Max generation time:     %10lf seconds\n",
           max_time);
    printf("Min generation time:     %10lf seconds\n",
           min_time);
    printf("Average generation time: %10lf seconds\n",
           total_time/i);

#ifdef GATHER_STATS
    {
        int fd;
        for (i=0; i<20; i++) {
            printf("lookup_stats[%d] = %ld\n", i, lookup_stats[i]);
        }
        fd = open("lookup_stats", O_CREAT | O_WRONLY, 0600);
        if (fd<1) {
            perror("File open failed on lookup table");
        } else {
            write(fd, lookup_stats, LOOKUP_TABLE_SIZE*4);
        }
        close(fd);
        free(lookup_stats);
    }
#endif

    munmap(lookup, LOOKUP_TABLE_SIZE);
}