16 files changed, 89 insertions, 26 deletions
diff --git a/cuda-global/gol b/cuda-global/gol
index 4fb8b81..85af8de 100755
--- a/cuda-global/gol
+++ b/cuda-global/gol
diff --git a/cuda-global/src/game.cu b/cuda-global/src/game.cu
index b0b5f61..abb5ed9 100644
--- a/cuda-global/src/game.cu
+++ b/cuda-global/src/game.cu
@@ -1,11 +1,12 @@
 #include "game.cuh"
 
+// Count the number of live neighbors a cell has
 __device__ int neighbors(struct GAME game, int x, int y) {
   int n = 0;
 
   for (int dy = -1; dy <= 1; dy++) {
     for (int dx = -1; dx <= 1; dx++) {
-      if (!(dx == 0 && dy == 0) && (x+dx) >= 0 && (y+dy) >= 0 && (x+dx) < game.width+(game.padding*2) && (y+dy) < game.height+(game.padding*2)) {
+      if (!(dx == 0 && dy == 0) && (x+dx) > 0 && (y+dy) > 0 && (x+dx) < game.width+(game.padding*2) && (y+dy) < game.height+(game.padding*2)) {
         if (game.grid[(y+dy) * (game.width+game.padding*2) + (x+dx)]) {
           n++;
         }
@@ -15,11 +16,14 @@ __device__ int neighbors(struct GAME game, int x, int y) {
   return n;
 }
 
+// Compute the next iteration of a board
+// newGrid must be passed in as a parameter; otherwise
+// each block would compute its own version of the next grid
 __global__ void next(struct GAME game, unsigned char* newGrid) {
   int idy = blockDim.y * blockIdx.y + threadIdx.y;
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
 
-  if (idy <= game.height+game.padding*2 && idx <= game.width+game.padding*2) {
+  if (idy < game.height+game.padding*2 && idx < game.width+game.padding*2) {
     int my_neighbors = neighbors(game, idx, idy);
     int my_coord = idy * (game.width+game.padding*2) + idx;
     newGrid[my_coord] = 0; // It's possible that there are artifacts from the last iteration
@@ -37,6 +41,7 @@ __global__ void next(struct GAME game, unsigned char* newGrid) {
   }
 }
 
+// Randomly assign life value to each cell
 void randomize(struct GAME* game) {
   for (int y = game->padding; y < game->height+game->padding; y++) {
     for (int x = game->padding; x < game->width+game->padding; x++) {
diff --git a/cuda-global/src/main.cu b/cuda-global/src/main.cu
index f906b2a..41c2abf 100644
--- a/cuda-global/src/main.cu
+++ b/cuda-global/src/main.cu
@@ -18,7 +18,7 @@
  */
 #define BLOCK 32
 #define PADDING 10
-#define VERBOSE 1
+//#define VERBOSE 1
 #define SEED 100
 
 // gpuErrchk source: https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
@@ -33,14 +33,16 @@ true) {
   }
 }
 
+// Do the simulation
 void simulate(int argc, char** argv) {
   srand(SEED);
-  clock_t totalStart = clock();
+  clock_t global_start = clock();
   char* filename;
   struct GAME game;
   game.padding = PADDING;
   int iterations, log_each_step;
   if (argc == 7) {
+    // Parse the arguments
     filename = argv[2];
     game.width = atoi(argv[3]);
     game.height = atoi(argv[4]);
@@ -60,6 +62,7 @@ void simulate(int argc, char** argv) {
   game.grid = (unsigned char*)malloc(size);
   memset(game.grid, 0, size);
 
+  // Choose where to read initial position
   if (strcmp(filename, "random") == 0) {
     randomize(&game);
   } else {
@@ -68,43 +71,53 @@ void simulate(int argc, char** argv) {
 
   char iteration_file[1024];
 
+  // Allocate device memory
   unsigned char* grid_d;
   unsigned char* newGrid;
   gpuErrchk(cudaMalloc(&grid_d, size));
-  gpuErrchk(cudaMemcpy(grid_d, game.grid, size, cudaMemcpyHostToDevice));
   gpuErrchk(cudaMalloc(&newGrid, size));
+  gpuErrchk(cudaMemcpy(grid_d, game.grid, size, cudaMemcpyHostToDevice)); // Copy the initial grid to the device
+  free(game.grid);
+  game.grid = grid_d; // Use the device copy
 
+  // Host-side buffer that results are copied back into from the device
   unsigned char* grid_h = (unsigned char*)malloc(size);
   unsigned char* temp;
 
-  game.grid = grid_d;
-
-  int grid_num = (int)ceil((game.width+(2*game.padding))/(float)BLOCK);
-  dim3 dim_grid(grid_num, grid_num, 1);
+  // Calculate the launch grid dimensions (in blocks) for the kernel
+  int grid_width = (int)ceil((game.width+(2*game.padding))/(float)BLOCK);
+  int grid_height = (int)ceil((game.height+(2*game.padding))/(float)BLOCK);
+  dim3 dim_grid(grid_width, grid_height, 1);
   dim3 dim_block(BLOCK, BLOCK, 1);
 
-  cudaEvent_t startLife, stopLife;
-  cudaEventCreate(&startLife);
-  cudaEventCreate(&stopLife);
-  double timeComputingLife = 0;
-  float localTime = 0;
+  // Timing
+  cudaEvent_t start, end;
+  cudaEventCreate(&start);
+  cudaEventCreate(&end);
+  double time_computing_life = 0;
+  float local_time = 0;
 
   for (int i = 0; i <= iterations; i++) {
+    // Iteration 0 will just be the initial grid
     if (i > 0) {
-      cudaEventRecord(startLife);
+      cudaEventRecord(start);
+      // Compute the next grid
       next<<<dim_grid, dim_block>>>(game, newGrid);
-      cudaEventRecord(stopLife);
-      cudaEventSynchronize(stopLife);
-      cudaEventElapsedTime(&localTime, startLife, stopLife);
-      timeComputingLife += localTime/1000;
+      cudaEventRecord(end);
+      cudaEventSynchronize(end);
+      cudaEventElapsedTime(&local_time, start, end);
+      time_computing_life += local_time/1000;
 
+      // Swap game.grid and newGrid
       temp = game.grid;
       game.grid = newGrid;
       newGrid = temp;
     }
     if (log_each_step) {
+      // If we are logging each step, perform IO operations
       gpuErrchk(cudaMemcpy(grid_h, game.grid, size, cudaMemcpyDeviceToHost));
       #ifdef VERBOSE
+        // Print the board without the padding elements
         printf("\n===Iteration %i===\n", i);
         for (int y = game.padding; y < game.height+game.padding; y++) {
           for (int x = game.padding; x < game.width+game.padding; x++) {
@@ -114,6 +127,7 @@ void simulate(int argc, char** argv) {
         }
         printf("===End iteration %i===\n", i);
       #endif
+      // Save to a file
       sprintf(iteration_file, "output/iteration-%07d.bin", i);
       temp = game.grid;
       game.grid = grid_h;
@@ -122,13 +136,7 @@ void simulate(int argc, char** argv) {
     }
   }
 
-  clock_t totalEnd = clock();
-  printf("\n===Timing===\nTime computing life: %f\nClock time: %f\n", timeComputingLife, ((double)totalEnd - (double)totalStart)/CLOCKS_PER_SEC);
-
-  cudaFree(&newGrid);
-  cudaFree(&grid_d);
-  cudaFree(&game.grid);
-  free(grid_h);
+  printf("\n===Timing===\nTime computing life: %f\nClock time: %f\n", time_computing_life, ((double)clock() - (double)global_start)/CLOCKS_PER_SEC);
 }
 
 int main(int argc, char** argv) {
diff --git a/cuda-global/timing-study/output--1000-1000.txt b/cuda-global/timing-study/output--1000-1000.txt
new file mode 100644
index 0000000..3792efc
--- /dev/null
+++ b/cuda-global/timing-study/output--1000-1000.txt
@@ -0,0 +1,4 @@
+
+===Timing===
+Time computing life: 0.169687
+Clock time: 1.560000
diff --git a/cuda-global/timing-study/output--1000-1250.txt b/cuda-global/timing-study/output--1000-1250.txt
new file mode 100644
index 0000000..9081eb5
--- /dev/null
+++ b/cuda-global/timing-study/output--1000-1250.txt
@@ -0,0 +1,4 @@
+
+===Timing===
+Time computing life: 0.254989
+Clock time: 2.240000
diff --git a/cuda-global/timing-study/output--1000-1500.txt b/cuda-global/timing-study/output--1000-1500.txt
new file mode 100644
index 0000000..c2fafe8
--- /dev/null
+++ b/cuda-global/timing-study/output--1000-1500.txt
@@ -0,0 +1,4 @@
+
+===Timing===
+Time computing life: 0.354361
+Clock time: 3.050000
diff --git a/cuda-global/timing-study/output--1000-1750.txt b/cuda-global/timing-study/output--1000-1750.txt
new file mode 100644
index 0000000..557165e
--- /dev/null
+++ b/cuda-global/timing-study/output--1000-1750.txt
@@ -0,0 +1,4 @@
+
+===Timing===
+Time computing life: 0.480174
+Clock time: 4.070000
diff --git a/cuda-global/timing-study/output--1000-2000.txt b/cuda-global/timing-study/output--1000-2000.txt
new file mode 100644
index 0000000..91a9d45
--- /dev/null
+++ b/cuda-global/timing-study/output--1000-2000.txt
@@ -0,0 +1,4 @@
+
+===Timing===
+Time computing life: 0.619636
+Clock time: 5.220000
diff --git a/cuda-global/timing-study/output--1000-250.txt b/cuda-global/timing-study/output--1000-250.txt
new file mode 100644
index 0000000..0808a20
--- /dev/null
+++ b/cuda-global/timing-study/output--1000-250.txt
@@ -0,0 +1,4 @@
+
+===Timing===
+Time computing life: 0.029867
+Clock time: 0.330000
diff --git a/cuda-global/timing-study/output--1000-500.txt b/cuda-global/timing-study/output--1000-500.txt
new file mode 100644
index 0000000..913b15e
--- /dev/null
+++ b/cuda-global/timing-study/output--1000-500.txt
@@ -0,0 +1,4 @@
+
+===Timing===
+Time computing life: 0.059907
+Clock time: 0.540000
diff --git a/cuda-global/timing-study/output--1000-750.txt b/cuda-global/timing-study/output--1000-750.txt
new file mode 100644
index 0000000..e3b8c5d
--- /dev/null
+++ b/cuda-global/timing-study/output--1000-750.txt
@@ -0,0 +1,4 @@
+
+===Timing===
+Time computing life: 0.110954
+Clock time: 1.000000
diff --git a/cuda-global/timing-study/slurm-3610476.err-notch081 b/cuda-global/timing-study/slurm-3610476.err-notch081
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/cuda-global/timing-study/slurm-3610476.err-notch081
diff --git a/cuda-global/timing-study/slurm-3610476.out-notch081 b/cuda-global/timing-study/slurm-3610476.out-notch081
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/cuda-global/timing-study/slurm-3610476.out-notch081
diff --git a/cuda-global/timing-study/slurm-3611549.err-notch081 b/cuda-global/timing-study/slurm-3611549.err-notch081
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/cuda-global/timing-study/slurm-3611549.err-notch081
diff --git a/cuda-global/timing-study/slurm-3611549.out-notch081 b/cuda-global/timing-study/slurm-3611549.out-notch081
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/cuda-global/timing-study/slurm-3611549.out-notch081
diff --git a/cuda-global/timing-study/timing_study.sh b/cuda-global/timing-study/timing_study.sh
new file mode 100755
index 0000000..ef7ebcb
--- /dev/null
+++ b/cuda-global/timing-study/timing_study.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+#SBATCH --time=0:30:00 # walltime, abbreviated by -t
+#SBATCH --nodes=1      # number of cluster nodes, abbreviated by -N
+#SBATCH -o slurm-%j.out-%N # name of the stdout, using the job number (%j) and the first node (%N)
+#SBATCH -e slurm-%j.err-%N # name of the stderr, using job and first node values
+#SBATCH --ntasks=1    # number of MPI tasks, abbreviated by -n
+# additional information for allocated clusters
+#SBATCH --account=notchpeak-shared-short     # account - abbreviated by -A
+#SBATCH --partition=notchpeak-shared-short  # partition, abbreviated by -p
+#SBATCH --gres=gpu:k80:1
+# Timing study: run ./gol on square random boards of increasing size.
+# Abort if the project directory is missing instead of running from the wrong place.
+cd "$HOME/gol/cuda-global" || exit 1
+# Each run's stdout is saved as timing-study/output-<iterations>-<size>.txt
+iterations=1000
+for size in 250 500 750 1000 1250 1500 1750 2000
+do
+  srun ./gol simulate random "$size" "$size" "$iterations" 1 > "timing-study/output-$iterations-$size.txt"
+done