From a3ee93e0365c14f981a3dd39b85c8a7b37703309 Mon Sep 17 00:00:00 2001
From: Elizabeth Hunt <elizabeth.hunt@simponic.xyz>
Date: Mon, 11 Dec 2023 19:24:33 -0700
Subject: hw9

---
 homeworks/hw-8.org |   2 +-
 homeworks/hw-8.pdf | Bin 104359 -> 104484 bytes
 homeworks/hw-8.tex |  14 +--
 homeworks/hw-9.org | 222 +++++++++++++++++++++++++++++++++++++++++++++++
 homeworks/hw-9.pdf | Bin 0 -> 53178 bytes
 homeworks/hw-9.tex | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 480 insertions(+), 8 deletions(-)
 create mode 100644 homeworks/hw-9.org
 create mode 100644 homeworks/hw-9.pdf
 create mode 100644 homeworks/hw-9.tex

(limited to 'homeworks')

diff --git a/homeworks/hw-8.org b/homeworks/hw-8.org
index 92a380f..10c7dd8 100644
--- a/homeworks/hw-8.org
+++ b/homeworks/hw-8.org
@@ -1,4 +1,4 @@
-#+TITLE: Homework 7
+#+TITLE: Homework 8
 #+AUTHOR: Elizabeth Hunt
 #+LATEX_HEADER: \notindent \notag  \usepackage{amsmath} \usepackage[a4paper,margin=1in,portrait]{geometry}
 #+LATEX: \setlength\parindent{0pt}
diff --git a/homeworks/hw-8.pdf b/homeworks/hw-8.pdf
index 601a75b..c14bb2e 100644
Binary files a/homeworks/hw-8.pdf and b/homeworks/hw-8.pdf differ
diff --git a/homeworks/hw-8.tex b/homeworks/hw-8.tex
index 5074689..9071f5b 100644
--- a/homeworks/hw-8.tex
+++ b/homeworks/hw-8.tex
@@ -1,4 +1,4 @@
-% Created 2023-12-09 Sat 21:43
+% Created 2023-12-09 Sat 22:06
 % Intended LaTeX compiler: pdflatex
 \documentclass[11pt]{article}
 \usepackage[utf8]{inputenc}
@@ -15,10 +15,10 @@
 \notindent \notag  \usepackage{amsmath} \usepackage[a4paper,margin=1in,portrait]{geometry}
 \author{Elizabeth Hunt}
 \date{\today}
-\title{Homework 7}
+\title{Homework 8}
 \hypersetup{
  pdfauthor={Elizabeth Hunt},
- pdftitle={Homework 7},
+ pdftitle={Homework 8},
  pdfkeywords={},
  pdfsubject={},
  pdfcreator={Emacs 28.2 (Org mode 9.7-pre)}, 
@@ -29,11 +29,11 @@
 \setlength\parindent{0pt}
 
 \section{Question One}
-\label{sec:orgb6d5cda}
+\label{sec:org800c743}
 See \texttt{UTEST(jacobi, solve\_jacobi)} in \texttt{test/jacobi.t.c} and the entry
 \texttt{Jacobi / Gauss-Siedel -> solve\_jacobi} in the LIZFCM API documentation.
 \section{Question Two}
-\label{sec:org9786314}
+\label{sec:org6121bef}
 We cannot just perform the Jacobi algorithm on a Leslie matrix since
 it is obviously not diagonally dominant - which is a requirement. It is
 certainly not always the case, but, if a Leslie matrix \(L\) is invertible, we can
@@ -53,13 +53,13 @@ but with the con that \(k\) is given by some function on both the convergence cr
 nonzero entries in the matrix - which might end up worse in some cases than the LU decomp approach.
 
 \section{Question Three}
-\label{sec:org0ea87d0}
+\label{sec:org11282e6}
 See \texttt{UTEST(jacobi, gauss\_siedel\_solve)} in \texttt{test/jacobi.t.c} which runs the same
 unit test as \texttt{UTEST(jacobi, solve\_jacobi)} but using the
 \texttt{Jacobi / Gauss-Siedel -> gauss\_siedel\_solve} method as documented in the LIZFCM API reference.
 
 \section{Question Four, Five}
-\label{sec:org8eea2ae}
+\label{sec:org22b52a9}
 We produce the following operation counts (by hackily adding the operation count as the last element
 to the solution vector) and errors - the sum of each vector elements' absolute value away from 1.0
 using the proceeding patch and unit test.
diff --git a/homeworks/hw-9.org b/homeworks/hw-9.org
new file mode 100644
index 0000000..de58d2a
--- /dev/null
+++ b/homeworks/hw-9.org
@@ -0,0 +1,222 @@
+#+TITLE: Homework 9
+#+AUTHOR: Elizabeth Hunt
+#+LATEX_HEADER: \notindent \notag  \usepackage{amsmath} \usepackage[a4paper,margin=1in,portrait]{geometry}
+#+LATEX: \setlength\parindent{0pt}
+#+OPTIONS: toc:nil
+
+* Question One
+
+With a ~matrix_dimension~ set to 700, I consistently see about a 3x improvement in performance on my
+10-thread machine. The serial implementation gives an average ~0.189s~ total runtime, while the below
+parallel implementation runs in about ~0.066s~ after the cpu cache has filled on the first run.
+
+#+BEGIN_SRC c
+#include <math.h>
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define matrix_dimension 700
+
+int n = matrix_dimension;
+float sum;
+
+int main() {
+  float A[n][n];
+  float x0[n];
+  float b[n];
+  float x1[n];
+  float res[n];
+
+  srand((unsigned int)(time(NULL)));
+
+  // not worth parallellization - rand() is not thread-safe
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < n; j++) {
+      A[i][j] = ((float)rand() / (float)(RAND_MAX) * 5.0);
+    }
+    x0[i] = ((float)rand() / (float)(RAND_MAX) * 5.0);
+  }
+
+#pragma omp parallel for private(sum)
+  for (int i = 0; i < n; i++) {
+    sum = 0.0;
+    for (int j = 0; j < n; j++) {
+      sum += fabs(A[i][j]);
+    }
+    A[i][i] += sum;
+  }
+
+#pragma omp parallel for private(sum)
+  for (int i = 0; i < n; i++) {
+    sum = 0.0;
+    for (int j = 0; j < n; j++) {
+      sum += A[i][j];
+    }
+    b[i] = sum;
+  }
+
+  float tol = 0.0001;
+  float error = 10.0 * tol;
+  int maxiter = 100;
+  int iter = 0;
+
+  while (error > tol && iter < maxiter) {
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      float temp_sum = b[i];
+      for (int j = 0; j < n; j++) {
+        temp_sum -= A[i][j] * x0[j];
+      }
+      res[i] = temp_sum;
+      x1[i] = x0[i] + res[i] / A[i][i];
+    }
+
+    sum = 0.0;
+#pragma omp parallel for reduction(+ : sum)
+    for (int i = 0; i < n; i++) {
+      float val = x1[i] - x0[i];
+      sum += val * val;
+    }
+    error = sqrt(sum);
+
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      x0[i] = x1[i];
+    }
+
+    iter++;
+  }
+
+  for (int i = 0; i < n; i++)
+    printf("x[%d] = %6f \t res[%d] = %6f\n", i, x1[i], i, res[i]);
+
+  return 0;
+}
+
+#+END_SRC
+
+* Question Two
+
+I only see lowerings in performance (likely due to overhead) on my machine using OpenMP until
+~matrix_dimension~ becomes quite large, about ~300~ in testing. At ~matrix_dimension=1000~, I see another
+about 3x improvement in total runtime (including initialization & I/O which was untouched, so, even further
+improvements could be made) on my 10-thread machine; from around ~0.174~ seconds to ~.052~.
+
+#+BEGIN_SRC c
+  #include <math.h>
+  #include <stdio.h>
+  #include <stdlib.h>
+  #include <time.h>
+
+  #ifdef _OPENMP
+  #include <omp.h>
+  #else
+  #define omp_get_num_threads() 0
+  #define omp_set_num_threads(int) 0
+  #define omp_get_thread_num() 0
+  #endif
+
+  #define matrix_dimension 1000
+
+  int n = matrix_dimension;
+  float ynrm;
+
+  int main() {
+    float A[n][n];
+    float v0[n];
+    float v1[n];
+    float y[n];
+    //
+    // create a matrix
+    //
+    // not worth parallellization - rand() is not thread-safe
+    srand((unsigned int)(time(NULL)));
+    float a = 5.0;
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < n; j++) {
+        A[i][j] = ((float)rand() / (float)(RAND_MAX)*a);
+      }
+      v0[i] = ((float)rand() / (float)(RAND_MAX)*a);
+    }
+    //
+    // modify the diagonal entries for diagonal dominance
+    // --------------------------------------------------
+    //
+    for (int i = 0; i < n; i++) {
+      float sum = 0.0;
+      for (int j = 0; j < n; j++) {
+        sum = sum + fabs(A[i][j]);
+      }
+      A[i][i] = A[i][i] + sum;
+    }
+    //
+    // generate a vector of ones
+    // -------------------------
+    //
+    for (int j = 0; j < n; j++) {
+      v0[j] = 1.0;
+    }
+    //
+    // power iteration test
+    // --------------------
+    //
+    float tol = 0.0000001;
+    float error = 10.0 * tol;
+    float lam1, lam0;
+    int maxiter = 100;
+    int iter = 0;
+
+    while (error > tol && iter < maxiter) {
+  #pragma omp parallel for
+      for (int i = 0; i < n; i++) {
+        y[i] = 0;
+        for (int j = 0; j < n; j++) {
+          y[i] = y[i] + A[i][j] * v0[j];
+        }
+      }
+
+      ynrm = 0.0;
+  #pragma omp parallel for reduction(+ : ynrm)
+      for (int i = 0; i < n; i++) {
+        ynrm += y[i] * y[i];
+      }
+      ynrm = sqrt(ynrm);
+
+  #pragma omp parallel for
+      for (int i = 0; i < n; i++) {
+        v1[i] = y[i] / ynrm;
+      }
+
+  #pragma omp parallel for
+      for (int i = 0; i < n; i++) {
+        y[i] = 0.0;
+        for (int j = 0; j < n; j++) {
+          y[i] += A[i][j] * v1[j];
+        }
+      }
+
+      lam1 = 0.0;
+  #pragma omp parallel for reduction(+ : lam1)
+      for (int i = 0; i < n; i++) {
+        lam1 += v1[i] * y[i];
+      }
+
+      error = fabs(lam1 - lam0);
+      lam0 = lam1;
+
+  #pragma omp parallel for
+      for (int i = 0; i < n; i++) {
+        v0[i] = v1[i];
+      }
+
+      iter++;
+    }
+
+    printf("in %d iterations, eigenvalue = %f\n", iter, lam1);
+  }
+#+END_SRC
+
+* Question Three
+[[https://static.simponic.xyz/lizfcm.pdf]]
diff --git a/homeworks/hw-9.pdf b/homeworks/hw-9.pdf
new file mode 100644
index 0000000..4753a3b
Binary files /dev/null and b/homeworks/hw-9.pdf differ
diff --git a/homeworks/hw-9.tex b/homeworks/hw-9.tex
new file mode 100644
index 0000000..9d5693a
--- /dev/null
+++ b/homeworks/hw-9.tex
@@ -0,0 +1,250 @@
+% Created 2023-12-11 Mon 19:24
+% Intended LaTeX compiler: pdflatex
+\documentclass[11pt]{article}
+\usepackage[utf8]{inputenc}
+\usepackage[T1]{fontenc}
+\usepackage{graphicx}
+\usepackage{longtable}
+\usepackage{wrapfig}
+\usepackage{rotating}
+\usepackage[normalem]{ulem}
+\usepackage{amsmath}
+\usepackage{amssymb}
+\usepackage{capt-of}
+\usepackage{hyperref}
+\notindent \notag  \usepackage{amsmath} \usepackage[a4paper,margin=1in,portrait]{geometry}
+\author{Elizabeth Hunt}
+\date{\today}
+\title{Homework 9}
+\hypersetup{
+ pdfauthor={Elizabeth Hunt},
+ pdftitle={Homework 9},
+ pdfkeywords={},
+ pdfsubject={},
+ pdfcreator={Emacs 28.2 (Org mode 9.7-pre)}, 
+ pdflang={English}}
+\begin{document}
+
+\maketitle
+\setlength\parindent{0pt}
+
+\section{Question One}
+\label{sec:org69bed2d}
+
+With a \texttt{matrix\_dimension} set to 700, I consistently see about a 3x improvement in performance on my
+10-thread machine. The serial implementation gives an average \texttt{0.189s} total runtime, while the below
+parallel implementation runs in about \texttt{0.066s} after the cpu cache has filled on the first run.
+
+\begin{verbatim}
+#include <math.h>
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define matrix_dimension 700
+
+int n = matrix_dimension;
+float sum;
+
+int main() {
+  float A[n][n];
+  float x0[n];
+  float b[n];
+  float x1[n];
+  float res[n];
+
+  srand((unsigned int)(time(NULL)));
+
+  // not worth parallellization - rand() is not thread-safe
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < n; j++) {
+      A[i][j] = ((float)rand() / (float)(RAND_MAX) * 5.0);
+    }
+    x0[i] = ((float)rand() / (float)(RAND_MAX) * 5.0);
+  }
+
+#pragma omp parallel for private(sum)
+  for (int i = 0; i < n; i++) {
+    sum = 0.0;
+    for (int j = 0; j < n; j++) {
+      sum += fabs(A[i][j]);
+    }
+    A[i][i] += sum;
+  }
+
+#pragma omp parallel for private(sum)
+  for (int i = 0; i < n; i++) {
+    sum = 0.0;
+    for (int j = 0; j < n; j++) {
+      sum += A[i][j];
+    }
+    b[i] = sum;
+  }
+
+  float tol = 0.0001;
+  float error = 10.0 * tol;
+  int maxiter = 100;
+  int iter = 0;
+
+  while (error > tol && iter < maxiter) {
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      float temp_sum = b[i];
+      for (int j = 0; j < n; j++) {
+        temp_sum -= A[i][j] * x0[j];
+      }
+      res[i] = temp_sum;
+      x1[i] = x0[i] + res[i] / A[i][i];
+    }
+
+    sum = 0.0;
+#pragma omp parallel for reduction(+ : sum)
+    for (int i = 0; i < n; i++) {
+      float val = x1[i] - x0[i];
+      sum += val * val;
+    }
+    error = sqrt(sum);
+
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      x0[i] = x1[i];
+    }
+
+    iter++;
+  }
+
+  for (int i = 0; i < n; i++)
+    printf("x[%d] = %6f \t res[%d] = %6f\n", i, x1[i], i, res[i]);
+
+  return 0;
+}
+
+\end{verbatim}
+
+\section{Question Two}
+\label{sec:orgbeace25}
+
+I only see lowerings in performance (likely due to overhead) on my machine using OpenMP until
+\texttt{matrix\_dimension} becomes quite large, about \texttt{300} in testing. At \texttt{matrix\_dimension=1000}, I see another
+about 3x improvement in total runtime (including initialization \& I/O which was untouched, so, even further
+improvements could be made) on my 10-thread machine; from around \texttt{0.174} seconds to \texttt{.052}.
+
+\begin{verbatim}
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#ifdef _OPENMP
+#include <omp.h>
+#else
+#define omp_get_num_threads() 0
+#define omp_set_num_threads(int) 0
+#define omp_get_thread_num() 0
+#endif
+
+#define matrix_dimension 1000
+
+int n = matrix_dimension;
+float ynrm;
+
+int main() {
+  float A[n][n];
+  float v0[n];
+  float v1[n];
+  float y[n];
+  //
+  // create a matrix
+  //
+  // not worth parallellization - rand() is not thread-safe
+  srand((unsigned int)(time(NULL)));
+  float a = 5.0;
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < n; j++) {
+      A[i][j] = ((float)rand() / (float)(RAND_MAX)*a);
+    }
+    v0[i] = ((float)rand() / (float)(RAND_MAX)*a);
+  }
+  //
+  // modify the diagonal entries for diagonal dominance
+  // --------------------------------------------------
+  //
+  for (int i = 0; i < n; i++) {
+    float sum = 0.0;
+    for (int j = 0; j < n; j++) {
+      sum = sum + fabs(A[i][j]);
+    }
+    A[i][i] = A[i][i] + sum;
+  }
+  //
+  // generate a vector of ones
+  // -------------------------
+  //
+  for (int j = 0; j < n; j++) {
+    v0[j] = 1.0;
+  }
+  //
+  // power iteration test
+  // --------------------
+  //
+  float tol = 0.0000001;
+  float error = 10.0 * tol;
+  float lam1, lam0;
+  int maxiter = 100;
+  int iter = 0;
+
+  while (error > tol && iter < maxiter) {
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      y[i] = 0;
+      for (int j = 0; j < n; j++) {
+        y[i] = y[i] + A[i][j] * v0[j];
+      }
+    }
+
+    ynrm = 0.0;
+#pragma omp parallel for reduction(+ : ynrm)
+    for (int i = 0; i < n; i++) {
+      ynrm += y[i] * y[i];
+    }
+    ynrm = sqrt(ynrm);
+
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      v1[i] = y[i] / ynrm;
+    }
+
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      y[i] = 0.0;
+      for (int j = 0; j < n; j++) {
+        y[i] += A[i][j] * v1[j];
+      }
+    }
+
+    lam1 = 0.0;
+#pragma omp parallel for reduction(+ : lam1)
+    for (int i = 0; i < n; i++) {
+      lam1 += v1[i] * y[i];
+    }
+
+    error = fabs(lam1 - lam0);
+    lam0 = lam1;
+
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      v0[i] = v1[i];
+    }
+
+    iter++;
+  }
+
+  printf("in %d iterations, eigenvalue = %f\n", iter, lam1);
+}
+\end{verbatim}
+
+\section{Question Three}
+\label{sec:org33439e0}
+\url{https://static.simponic.xyz/lizfcm.pdf}
+\end{document}
\ No newline at end of file
-- 
cgit v1.2.3-70-g09d2