/* Optimized Conway's Game of Life
*
* Board:
* - 10,000 X 10,000 cells
* - stored as 2,500 X 5,000 array of blocks
* Cell Block:
* - 4 X 2 cell area of the board stored in a "unsigned char"
* - x=0, y=0 stored in LSB, x=3, y=1 stored in MSB
* Cell Area:
* - 6 x 4 cell area of the board stored as "unsigned char"
* - represent 1 block and 8 neighboring blocks
* - the strange bit arrangment below is to get the most commonly
* used indexes into as few cache lines as possible:
* 00: 4 of block x+1,y-1
* 01: 3 of block x-1,y
* 02: 4 of block x+1,y
* 03: 7 of block x-1,y-1
*
* 03: 0 of block x+1,y
* 05: 7 of block x-1,y
* 06: 0 of block x+1,y+1
* 07: 3 of block x-1,y+1
* 08-11: 4-7 of block x ,y-1
* 12-19: 0-7 of block x, y
* 20-23: 0-3 of block x ,y+1
* Lookup Table:
* - array of blocks (4X2 cells), indexed by area (24 bits)
*/
#include <stdlib.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <errno.h>
#include <sys/time.h>
#include <unistd.h>
//#define GATHER_STATS 1
//#define DEBUG_LEVEL 1 //Print terse debug info
#define DEBUG_LEVEL 2 //Print verbose debug info
#define ABS_SIZE_X 10000
#define ABS_SIZE_Y 10000 // 8188/2+2 = 4096
#if (SIZE_X%4) !=0
X size must be multiple of 4
#endif
#if (SIZE_Y%2) !=0
Y size must be multiple of 2
#endif
#define SIZE_X ABS_SIZE_X/4
#define SIZE_Y ABS_SIZE_Y/2
#define LOOKUP_TABLE_SIZE 16777216
/* Timing stuff */
#define TIMER_CLEAR (tv1.tv_sec=tv1.tv_usec=tv2.tv_sec=tv2.tv_usec=0)
#define TIMER_START gettimeofday(&tv1, (struct timezone*)0)
#define TIMER_STOP gettimeofday(&tv2, (struct timezone*)0)
#define TIMER_ELAPSED (tv2.tv_sec-tv1.tv_sec+(tv2.tv_usec-tv1.tv_usec)*1.E-6)
struct timeval tv1,tv2;
/* +1 to allow for right and +2 bottom edge */
unsigned char board[SIZE_X+1][SIZE_Y+2];
/* Lookup (read) cache */
unsigned char bcol1[SIZE_Y+4]; /* One extra at the top and three at the bottom */
unsigned char bcol2[SIZE_Y+4];
unsigned char bcol3[SIZE_Y+4];
/* Store (write) cache */
unsigned char ncol1[SIZE_Y];
unsigned char ncol2[SIZE_Y];
unsigned char ncol3[SIZE_Y];
unsigned int * lookup_stats;
/*
* Count number of neighboring living cells for a given cell
*/
int count_neighbors(int bit_array[6][4], int x, int y) {
int adder,i,j;
adder=0;
for (i=x-1; i<=x+1; i++) {
for (j=y-1; j<=y+1; j++) {
if ((i!=x) || (j!=y)) {
adder=adder+bit_array[i][j];
}
}
}
return(adder);
}
/*
* Translate the optimized area representation into
* the visual representational model
*/
unsigned int translate_area(unsigned int opt_area)
{
unsigned int input_area;
input_area = (opt_area & 0x00f000) >> 5; /* bits 7-10 */
input_area |= (opt_area & 0x0f0000) >> 3; /* bits 11-16 */
input_area |= (opt_area & 0x000f00) >> 7; /* bits 1-4 */
input_area |= (opt_area & 0xf00000) >> 1; /* bits 19-22 */
input_area |= (opt_area & 0x000001) << 5; /* bit 5 */
input_area |= (opt_area & 0x000002) << 5; /* bit 6 */
input_area |= (opt_area & 0x000004) << 15; /* bit 17 */
input_area |= (opt_area & 0x000008) >> 3; /* bit 0 */
input_area |= (opt_area & 0x000010) << 7; /* bit 11 */
input_area |= (opt_area & 0x000020) << 7; /* bit 12 */
input_area |= (opt_area & 0x000040) << 17; /* bit 23 */
input_area |= (opt_area & 0x000080) << 11; /* bit 18 */
return(input_area);
}
/*
* Figure out the result of one life iteration for a life block.
* The input block is 24 bits (unsigned int)
* The output block is 8 bits (char)
* This is only used for creating the lookup table initially.
* This is not part of the optimized code path.
*/
char calc_block(unsigned int opt_area)
{
unsigned int input_area;
int input_cell[6][4];
char result_block;
int result_cell[4][2];
int x,y,i;
input_area=translate_area(opt_area);
/* Populate input_cell array from input_area*/
for (x=0; x<6; x++) {
for (y=0; y<4; y++) {
if ((input_area >> (y*6+x)) & 0x1) {
input_cell[x][y] =1;
} else {
input_cell[x][y] =0;
}
}
}
for (x=1; x<5; x++) {
for (y=1; y<3; y++) {
switch (count_neighbors(input_cell, x, y)) {
case 3:
result_cell[x-1][y-1] =1;
break;
case 2:
if (input_cell[x][y]) {
result_cell[x-1][y-1] =1;
} else {
result_cell[x-1][y-1] =0;
}
break;
default:
result_cell[x-1][y-1]=0;
break;
}
}
}
/* Create result_block from result_cell array */
result_block = 0;
for (x=0; x<4; x++) {
for (y=0; y<2; y++) {
if (result_cell[x][y] == 1) {
result_block |= (0x1 << y*4+x);
}
}
}
return(result_block);
}
print_area(unsigned int opt_area)
{
unsigned int input_area;
int x,y,i;
input_area = translate_area(opt_area);
/* Populate input_cell array from input_area*/
for (y=0; y<4; y++) {
for (x=0; x<6; x++) {
if ((input_area >> (y*6+x)) & 0x1) {
printf("O");
} else {
printf(".");
}
}
printf("\n");
}
}
/*
* Take a numeric block representation and print it:
* - 'O' for a living cell
* - '.' for an empty cell
*/
print_block(char block)
{
int x, y;
for (y=0; y<2; y++) {
for (x=0; x<4; x++) {
if ((block >> (y*4+x)) & 0x1) {
printf("O");
} else {
printf(".");
}
}
printf("\n");
}
}
/*
* Print a rectangular region of the board
* - 'O' for a living cell
* - '.' for an empty cell
*/
print_board_rect(
unsigned char grid[SIZE_X+1][SIZE_Y+2],
unsigned int startx,
unsigned int starty,
unsigned int width,
unsigned int height )
{
unsigned int i,j,k,l,block;
for (i=0; i<height; i++) {
for (l=0; l<2; l++) {
for (j=0; j<width; j++) {
block=grid[startx+j][starty+i];
for (k=0; k<4; k++) {
if ((block >> (l*4+k)) & 0x1) {
printf("O");
} else {
printf(".");
}
}
}
printf("\n");
}
}
}
/*
* Create the lookup table array.
* - First try and mmap a pre-generated file.
* - If the file doesn't exist calculate the table in memory
* dump it to the file and re-mmap it.
*/
char * create_lookup_table(char * filename)
{
unsigned int i;
unsigned int j;
unsigned int k;
char * table;
int fd;
long int rsz;
fd = open(filename, O_RDONLY, 0600);
if (fd<1) {
perror("File open failed on lookup table");
close(fd);
table = (char *)malloc(sizeof(char) * LOOKUP_TABLE_SIZE);
if (! table ) {
return(table);
}
printf("Populating lookup:\n");
for (i=0; i<LOOKUP_TABLE_SIZE; i++) {
if (i%(LOOKUP_TABLE_SIZE/10)==0) printf (".\n", i);
table[i] =calc_block(i);
}
/* Writing out lookup table */
fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0600);
write(fd, table, LOOKUP_TABLE_SIZE);
close(fd);
free(table);
printf("Lookup table written -- trying mmap again\n");
fd = open(filename, O_RDONLY, 0600);
if (fd<1) {
perror("File open failed again on lookup table, giving up");
return 0;
}
}
rsz = lseek(fd, 0, SEEK_END);
if(rsz%4096) {
rsz+=4096-(rsz%4096);
}
//table = mmap(NULL, rsz, PROT_READ, MAP_FILE|MAP_VARIABLE|MAP_SHARED,fd,0);
table = mmap(NULL, rsz, PROT_READ, MAP_FILE|MAP_SHARED,fd,0);
if (table==MAP_FAILED) {
perror("Couldn't mmap");
return(NULL);
}
close(fd);
return(table);
}
/*
* Randomly seed the board with 30% living cells (70% empty)
*/
populate_board()
{
int x,y,bit;
srand(91823);
for (y=0; y<SIZE_Y; y++) {
for (x=0; x<SIZE_X; x++) {
board[x][y] = 0;
for (bit=0; bit<8; bit++) {
board[x][y] |= (rand() < (RAND_MAX/3) ? 1 : 0) << bit;
}
}
}
/* Fill the right edge column */
for (y=0; y<SIZE_Y+1; y++) {
board[SIZE_X][y] = 0;
}
/* Fill the bottom edge row */
for (x=0; x<SIZE_X+1; x++) {
board[x][SIZE_Y] = 0;
}
}
/*
* The CALC_COL macro and the mate function are the only routines
* in the optimized code path.
*/
/*
* "loop-unroll" CALC_COL() macro for mate() function
*/
#ifdef GATHER_STATS
#define CALC_COL(NN1, NN2, NN3) \
bcopy(board[x+1], &bcol##NN3[1], SIZE_Y); \
colptr1 = (unsigned int *)bcol##NN1; \
colptr2 = (unsigned int *)bcol##NN2; \
colptr3 = (unsigned int *)bcol##NN3; \
for (y=0; y<SIZE_Y; y++) { \
/* Calculate area value */ \
area2 = (*colptr1 & 0x088880) | \
(*colptr3 & 0x011110); \
area = \
((area2 & 0x019800) >> 10) | \
((area2 & 0x000190) >> 4) | \
((area2 & 0x080000) >> 12) | \
((*colptr2 & 0x0ffff0) << 4); \
ncol##NN1[y] = lookup[area]; \
lookup_stats[area]++; \
colptr1 = (unsigned int *) ((char *) colptr1 + 1); \
colptr2 = (unsigned int *) ((char *) colptr2 + 1); \
colptr3 = (unsigned int *) ((char *) colptr3 + 1); \
}
#else
#define CALC_COL(NN1, NN2, NN3) \
bcopy(board[x+1], &bcol##NN3[1], SIZE_Y); \
colptr1 = (unsigned int *)bcol##NN1; \
colptr2 = (unsigned int *)bcol##NN2; \
colptr3 = (unsigned int *)bcol##NN3; \
for (y=0; y<SIZE_Y; y++) { \
/* Calculate area value */ \
area2 = (*colptr1 & 0x088880) | \
(*colptr3 & 0x011110); \
ncol##NN1[y] = lookup[ \
((area2 & 0x019800) >> 10) | \
((area2 & 0x000190) >> 4) | \
((area2 & 0x080000) >> 12) | \
((*colptr2 & 0x0ffff0) << 4)]; \
colptr1 = (unsigned int *) ((char *) colptr1 + 1); \
colptr2 = (unsigned int *) ((char *) colptr2 + 1); \
colptr3 = (unsigned int *) ((char *) colptr3 + 1); \
}
#endif
/*
* Calculate a new generation for an entire board.
*/
mate(char * lookup)
{
unsigned int x,y;
unsigned int area, area2;
//__unaligned register unsigned int *colptr1,*colptr2, *colptr3;
register unsigned int *colptr1,*colptr2, *colptr3;
/* Set the block values for outside edges */
bzero(bcol1, SIZE_Y+4);
bcopy(board[0], &bcol2[1], SIZE_Y);
bcol2[0] =0;
bcol2[SIZE_Y+1] =0;
bcol2[SIZE_Y+2] =0;
bcol2[SIZE_Y+3] =0;
bcol3[0] =0;
bcol3[SIZE_Y+1] =0;
bcol3[SIZE_Y+2] =0;
bcol3[SIZE_Y+3] =0;
for (x=0; x<SIZE_X; x++) {
CALC_COL(1,2,3);
x++;
if (x>=(SIZE_X)) {
bcopy(ncol1, board[x-1], SIZE_Y);
break;
}
CALC_COL(2,3,1);
x++;
if (x>=(SIZE_X)) {
bcopy(ncol1, board[x-2], SIZE_Y);
bcopy(ncol2, board[x-1], SIZE_Y);
break;
}
CALC_COL(3,1,2);
bcopy(ncol1, board[x-2], SIZE_Y);
bcopy(ncol2, board[x-1], SIZE_Y);
bcopy(ncol3, board[x], SIZE_Y);
}
}
#undef CALC_COL
/*
* main:
* - mmap the lookup table or generate it
* - Randomly populate the board
* - Iterate through board generations
* - Print timing information
*
* - Gather stats and print debug info as requested
*/
int main(int argc, char *argv[]) {
unsigned int i,j;
char * lookup;
float total_time=0.0;
float min_time=10000.0;
float max_time=0.0;
lookup = create_lookup_table("lookup.tbl");
if (! lookup ) {
printf ("Couldn't load lookup table. Exiting.\n");
return -1;
}
#ifdef GATHER_STATS
lookup_stats = (unsigned int *)malloc(sizeof(unsigned int) * LOOKUP_TABLE_SIZE);
for (i=0; i<LOOKUP_TABLE_SIZE; i++) {
lookup_stats[i] =0;
}
#endif
printf("Populating board:\n");
TIMER_CLEAR;
TIMER_START;
populate_board();
TIMER_STOP;
printf("Populating TIME: %lf seconds\n",TIMER_ELAPSED);
#ifdef DEBUG_LEVEL
print_board_rect(board, SIZE_X-15, SIZE_Y-10, 15, 10);
printf("\n");
#endif
printf("Processing board:\n");
for (i=0; i<30; i++) {
TIMER_CLEAR;
TIMER_START;
mate(lookup);
TIMER_STOP;
total_time += TIMER_ELAPSED;
if (TIMER_ELAPSED > max_time)
max_time = TIMER_ELAPSED;
if (TIMER_ELAPSED < min_time)
min_time = TIMER_ELAPSED;
#if DEBUG_LEVEL==2
print_board_rect(board, SIZE_X-30, SIZE_Y-30, 30, 30);
#endif
printf("Generation %d took %lf seconds\n",i+1, TIMER_ELAPSED);
}
printf("Finished processing board\n");
#ifdef DEBUG_LEVEL
print_board_rect(board, SIZE_X-30, SIZE_Y-30, 30, 30);
printf("\n");
#endif
printf("All generations time: %10lf seconds (%d generations)\n",
total_time,i);
printf("Max generation time: %10lf seconds\n",
max_time);
printf("Min generation time: %10lf seconds\n",
min_time);
printf("Average generation time: %10lf seconds\n",
total_time/i);
#ifdef GATHER_STATS
{
int fd;
for (i=0; i<20; i++) {
printf("lookup_stats[%d] = %ld\n", i, lookup_stats[i]);
}
fd = open("lookup_stats", O_CREAT | O_WRONLY, 0600);
if (fd<1) {
perror("File open failed on lookup table");
} else {
write(fd, lookup_stats, LOOKUP_TABLE_SIZE*4);
}
close(fd);
free(lookup_stats);
}
#endif
munmap(lookup, LOOKUP_TABLE_SIZE);
}