From a3ee93e0365c14f981a3dd39b85c8a7b37703309 Mon Sep 17 00:00:00 2001 From: Elizabeth Hunt Date: Mon, 11 Dec 2023 19:24:33 -0700 Subject: hw9 --- homeworks/hw-8.org | 2 +- homeworks/hw-8.pdf | Bin 104359 -> 104484 bytes homeworks/hw-8.tex | 14 +-- homeworks/hw-9.org | 222 +++++++++++++++++++++++++++++++++++++++++++++++ homeworks/hw-9.pdf | Bin 0 -> 53178 bytes homeworks/hw-9.tex | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 480 insertions(+), 8 deletions(-) create mode 100644 homeworks/hw-9.org create mode 100644 homeworks/hw-9.pdf create mode 100644 homeworks/hw-9.tex (limited to 'homeworks') diff --git a/homeworks/hw-8.org b/homeworks/hw-8.org index 92a380f..10c7dd8 100644 --- a/homeworks/hw-8.org +++ b/homeworks/hw-8.org @@ -1,4 +1,4 @@ -#+TITLE: Homework 7 +#+TITLE: Homework 8 #+AUTHOR: Elizabeth Hunt #+LATEX_HEADER: \notindent \notag \usepackage{amsmath} \usepackage[a4paper,margin=1in,portrait]{geometry} #+LATEX: \setlength\parindent{0pt} diff --git a/homeworks/hw-8.pdf b/homeworks/hw-8.pdf index 601a75b..c14bb2e 100644 Binary files a/homeworks/hw-8.pdf and b/homeworks/hw-8.pdf differ diff --git a/homeworks/hw-8.tex b/homeworks/hw-8.tex index 5074689..9071f5b 100644 --- a/homeworks/hw-8.tex +++ b/homeworks/hw-8.tex @@ -1,4 +1,4 @@ -% Created 2023-12-09 Sat 21:43 +% Created 2023-12-09 Sat 22:06 % Intended LaTeX compiler: pdflatex \documentclass[11pt]{article} \usepackage[utf8]{inputenc} @@ -15,10 +15,10 @@ \notindent \notag \usepackage{amsmath} \usepackage[a4paper,margin=1in,portrait]{geometry} \author{Elizabeth Hunt} \date{\today} -\title{Homework 7} +\title{Homework 8} \hypersetup{ pdfauthor={Elizabeth Hunt}, - pdftitle={Homework 7}, + pdftitle={Homework 8}, pdfkeywords={}, pdfsubject={}, pdfcreator={Emacs 28.2 (Org mode 9.7-pre)}, @@ -29,11 +29,11 @@ \setlength\parindent{0pt} \section{Question One} -\label{sec:orgb6d5cda} +\label{sec:org800c743} See \texttt{UTEST(jacobi, solve\_jacobi)} in \texttt{test/jacobi.t.c} and the entry \texttt{Jacobi / Gauss-Siedel -> solve\_jacobi} in the LIZFCM API documentation. \section{Question Two} -\label{sec:org9786314} +\label{sec:org6121bef} We cannot just perform the Jacobi algorithm on a Leslie matrix since it is obviously not diagonally dominant - which is a requirement. It is certainly not always the case, but, if a Leslie matrix \(L\) is invertible, we can @@ -53,13 +53,13 @@ but with the con that \(k\) is given by some function on both the convergence cr nonzero entries in the matrix - which might end up worse in some cases than the LU decomp approach. \section{Question Three} -\label{sec:org0ea87d0} +\label{sec:org11282e6} See \texttt{UTEST(jacobi, gauss\_siedel\_solve)} in \texttt{test/jacobi.t.c} which runs the same unit test as \texttt{UTEST(jacobi, solve\_jacobi)} but using the \texttt{Jacobi / Gauss-Siedel -> gauss\_siedel\_solve} method as documented in the LIZFCM API reference. \section{Question Four, Five} -\label{sec:org8eea2ae} +\label{sec:org22b52a9} We produce the following operation counts (by hackily adding the operation count as the last element to the solution vector) and errors - the sum of each vector elements' absolute value away from 1.0 using the proceeding patch and unit test. diff --git a/homeworks/hw-9.org b/homeworks/hw-9.org new file mode 100644 index 0000000..de58d2a --- /dev/null +++ b/homeworks/hw-9.org @@ -0,0 +1,222 @@ +#+TITLE: Homework 9 +#+AUTHOR: Elizabeth Hunt +#+LATEX_HEADER: \notindent \notag \usepackage{amsmath} \usepackage[a4paper,margin=1in,portrait]{geometry} +#+LATEX: \setlength\parindent{0pt} +#+OPTIONS: toc:nil + +* Question One + +With a ~matrix_dimension~ set to 700, I consistently see about a 3x improvement in performance on my +10-thread machine. The serial implementation gives an average ~0.189s~ total runtime, while the below +parallel implementation runs in about ~0.066s~ after the cpu cache has filled on the first run. + +#+BEGIN_SRC c +#include +#include +#include +#include +#include + +#define matrix_dimension 700 + +int n = matrix_dimension; +float sum; + +int main() { + float A[n][n]; + float x0[n]; + float b[n]; + float x1[n]; + float res[n]; + + srand((unsigned int)(time(NULL))); + + // not worth parallellization - rand() is not thread-safe + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + A[i][j] = ((float)rand() / (float)(RAND_MAX) * 5.0); + } + x0[i] = ((float)rand() / (float)(RAND_MAX) * 5.0); + } + +#pragma omp parallel for private(sum) + for (int i = 0; i < n; i++) { + sum = 0.0; + for (int j = 0; j < n; j++) { + sum += fabs(A[i][j]); + } + A[i][i] += sum; + } + +#pragma omp parallel for private(sum) + for (int i = 0; i < n; i++) { + sum = 0.0; + for (int j = 0; j < n; j++) { + sum += A[i][j]; + } + b[i] = sum; + } + + float tol = 0.0001; + float error = 10.0 * tol; + int maxiter = 100; + int iter = 0; + + while (error > tol && iter < maxiter) { +#pragma omp parallel for + for (int i = 0; i < n; i++) { + float temp_sum = b[i]; + for (int j = 0; j < n; j++) { + temp_sum -= A[i][j] * x0[j]; + } + res[i] = temp_sum; + x1[i] = x0[i] + res[i] / A[i][i]; + } + + sum = 0.0; +#pragma omp parallel for reduction(+ : sum) + for (int i = 0; i < n; i++) { + float val = x1[i] - x0[i]; + sum += val * val; + } + error = sqrt(sum); + +#pragma omp parallel for + for (int i = 0; i < n; i++) { + x0[i] = x1[i]; + } + + iter++; + } + + for (int i = 0; i < n; i++) + printf("x[%d] = %6f \t res[%d] = %6f\n", i, x1[i], i, res[i]); + + return 0; +} + +#+END_SRC + +* Question Two + +I only see lowerings in performance (likely due to overhead) on my machine using OpenMP until +~matrix_dimension~ becomes quite large, about ~300~ in testing. At ~matrix_dimension=1000~, I see another +about 3x improvement in total runtime (including initialization & I/O which was untouched, so, even further +improvements could be made) on my 10-thread machine; from around ~0.174~ seconds to ~.052~. + +#+BEGIN_SRC c + #include + #include + #include + #include + + #ifdef _OPENMP + #include + #else + #define omp_get_num_threads() 0 + #define omp_set_num_threads(int) 0 + #define omp_get_thread_num() 0 + #endif + + #define matrix_dimension 1000 + + int n = matrix_dimension; + float ynrm; + + int main() { + float A[n][n]; + float v0[n]; + float v1[n]; + float y[n]; + // + // create a matrix + // + // not worth parallellization - rand() is not thread-safe + srand((unsigned int)(time(NULL))); + float a = 5.0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + A[i][j] = ((float)rand() / (float)(RAND_MAX)*a); + } + v0[i] = ((float)rand() / (float)(RAND_MAX)*a); + } + // + // modify the diagonal entries for diagonal dominance + // -------------------------------------------------- + // + for (int i = 0; i < n; i++) { + float sum = 0.0; + for (int j = 0; j < n; j++) { + sum = sum + fabs(A[i][j]); + } + A[i][i] = A[i][i] + sum; + } + // + // generate a vector of ones + // ------------------------- + // + for (int j = 0; j < n; j++) { + v0[j] = 1.0; + } + // + // power iteration test + // -------------------- + // + float tol = 0.0000001; + float error = 10.0 * tol; + float lam1, lam0; + int maxiter = 100; + int iter = 0; + + while (error > tol && iter < maxiter) { + #pragma omp parallel for + for (int i = 0; i < n; i++) { + y[i] = 0; + for (int j = 0; j < n; j++) { + y[i] = y[i] + A[i][j] * v0[j]; + } + } + + ynrm = 0.0; + #pragma omp parallel for reduction(+ : ynrm) + for (int i = 0; i < n; i++) { + ynrm += y[i] * y[i]; + } + ynrm = sqrt(ynrm); + + #pragma omp parallel for + for (int i = 0; i < n; i++) { + v1[i] = y[i] / ynrm; + } + + #pragma omp parallel for + for (int i = 0; i < n; i++) { + y[i] = 0.0; + for (int j = 0; j < n; j++) { + y[i] += A[i][j] * v1[j]; + } + } + + lam1 = 0.0; + #pragma omp parallel for reduction(+ : lam1) + for (int i = 0; i < n; i++) { + lam1 += v1[i] * y[i]; + } + + error = fabs(lam1 - lam0); + lam0 = lam1; + + #pragma omp parallel for + for (int i = 0; i < n; i++) { + v0[i] = v1[i]; + } + + iter++; + } + + printf("in %d iterations, eigenvalue = %f\n", iter, lam1); + } +#+END_SRC + +* Question Three +[[https://static.simponic.xyz/lizfcm.pdf]] diff --git a/homeworks/hw-9.pdf b/homeworks/hw-9.pdf new file mode 100644 index 0000000..4753a3b Binary files /dev/null and b/homeworks/hw-9.pdf differ diff --git a/homeworks/hw-9.tex b/homeworks/hw-9.tex new file mode 100644 index 0000000..9d5693a --- /dev/null +++ b/homeworks/hw-9.tex @@ -0,0 +1,250 @@ +% Created 2023-12-11 Mon 19:24 +% Intended LaTeX compiler: pdflatex +\documentclass[11pt]{article} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{graphicx} +\usepackage{longtable} +\usepackage{wrapfig} +\usepackage{rotating} +\usepackage[normalem]{ulem} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{capt-of} +\usepackage{hyperref} +\notindent \notag \usepackage{amsmath} \usepackage[a4paper,margin=1in,portrait]{geometry} +\author{Elizabeth Hunt} +\date{\today} +\title{Homework 9} +\hypersetup{ + pdfauthor={Elizabeth Hunt}, + pdftitle={Homework 9}, + pdfkeywords={}, + pdfsubject={}, + pdfcreator={Emacs 28.2 (Org mode 9.7-pre)}, + pdflang={English}} +\begin{document} + +\maketitle +\setlength\parindent{0pt} + +\section{Question One} +\label{sec:org69bed2d} + +With a \texttt{matrix\_dimension} set to 700, I consistently see about a 3x improvement in performance on my +10-thread machine. The serial implementation gives an average \texttt{0.189s} total runtime, while the below +parallel implementation runs in about \texttt{0.066s} after the cpu cache has filled on the first run. + +\begin{verbatim} +#include +#include +#include +#include +#include + +#define matrix_dimension 700 + +int n = matrix_dimension; +float sum; + +int main() { + float A[n][n]; + float x0[n]; + float b[n]; + float x1[n]; + float res[n]; + + srand((unsigned int)(time(NULL))); + + // not worth parallellization - rand() is not thread-safe + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + A[i][j] = ((float)rand() / (float)(RAND_MAX) * 5.0); + } + x0[i] = ((float)rand() / (float)(RAND_MAX) * 5.0); + } + +#pragma omp parallel for private(sum) + for (int i = 0; i < n; i++) { + sum = 0.0; + for (int j = 0; j < n; j++) { + sum += fabs(A[i][j]); + } + A[i][i] += sum; + } + +#pragma omp parallel for private(sum) + for (int i = 0; i < n; i++) { + sum = 0.0; + for (int j = 0; j < n; j++) { + sum += A[i][j]; + } + b[i] = sum; + } + + float tol = 0.0001; + float error = 10.0 * tol; + int maxiter = 100; + int iter = 0; + + while (error > tol && iter < maxiter) { +#pragma omp parallel for + for (int i = 0; i < n; i++) { + float temp_sum = b[i]; + for (int j = 0; j < n; j++) { + temp_sum -= A[i][j] * x0[j]; + } + res[i] = temp_sum; + x1[i] = x0[i] + res[i] / A[i][i]; + } + + sum = 0.0; +#pragma omp parallel for reduction(+ : sum) + for (int i = 0; i < n; i++) { + float val = x1[i] - x0[i]; + sum += val * val; + } + error = sqrt(sum); + +#pragma omp parallel for + for (int i = 0; i < n; i++) { + x0[i] = x1[i]; + } + + iter++; + } + + for (int i = 0; i < n; i++) + printf("x[%d] = %6f \t res[%d] = %6f\n", i, x1[i], i, res[i]); + + return 0; +} + +\end{verbatim} + +\section{Question Two} +\label{sec:orgbeace25} + +I only see lowerings in performance (likely due to overhead) on my machine using OpenMP until +\texttt{matrix\_dimension} becomes quite large, about \texttt{300} in testing. At \texttt{matrix\_dimension=1000}, I see another +about 3x improvement in total runtime (including initialization \& I/O which was untouched, so, even further +improvements could be made) on my 10-thread machine; from around \texttt{0.174} seconds to \texttt{.052}. + +\begin{verbatim} +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#else +#define omp_get_num_threads() 0 +#define omp_set_num_threads(int) 0 +#define omp_get_thread_num() 0 +#endif + +#define matrix_dimension 1000 + +int n = matrix_dimension; +float ynrm; + +int main() { + float A[n][n]; + float v0[n]; + float v1[n]; + float y[n]; + // + // create a matrix + // + // not worth parallellization - rand() is not thread-safe + srand((unsigned int)(time(NULL))); + float a = 5.0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + A[i][j] = ((float)rand() / (float)(RAND_MAX)*a); + } + v0[i] = ((float)rand() / (float)(RAND_MAX)*a); + } + // + // modify the diagonal entries for diagonal dominance + // -------------------------------------------------- + // + for (int i = 0; i < n; i++) { + float sum = 0.0; + for (int j = 0; j < n; j++) { + sum = sum + fabs(A[i][j]); + } + A[i][i] = A[i][i] + sum; + } + // + // generate a vector of ones + // ------------------------- + // + for (int j = 0; j < n; j++) { + v0[j] = 1.0; + } + // + // power iteration test + // -------------------- + // + float tol = 0.0000001; + float error = 10.0 * tol; + float lam1, lam0; + int maxiter = 100; + int iter = 0; + + while (error > tol && iter < maxiter) { +#pragma omp parallel for + for (int i = 0; i < n; i++) { + y[i] = 0; + for (int j = 0; j < n; j++) { + y[i] = y[i] + A[i][j] * v0[j]; + } + } + + ynrm = 0.0; +#pragma omp parallel for reduction(+ : ynrm) + for (int i = 0; i < n; i++) { + ynrm += y[i] * y[i]; + } + ynrm = sqrt(ynrm); + +#pragma omp parallel for + for (int i = 0; i < n; i++) { + v1[i] = y[i] / ynrm; + } + +#pragma omp parallel for + for (int i = 0; i < n; i++) { + y[i] = 0.0; + for (int j = 0; j < n; j++) { + y[i] += A[i][j] * v1[j]; + } + } + + lam1 = 0.0; +#pragma omp parallel for reduction(+ : lam1) + for (int i = 0; i < n; i++) { + lam1 += v1[i] * y[i]; + } + + error = fabs(lam1 - lam0); + lam0 = lam1; + +#pragma omp parallel for + for (int i = 0; i < n; i++) { + v0[i] = v1[i]; + } + + iter++; + } + + printf("in %d iterations, eigenvalue = %f\n", iter, lam1); +} +\end{verbatim} + +\section{Question Three} +\label{sec:org33439e0} +\url{https://static.simponic.xyz/lizfcm.pdf} +\end{document} \ No newline at end of file -- cgit v1.2.3-70-g09d2