-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Philipp Gschwandtner
committed
Apr 8, 2024
1 parent
d85e598
commit 77e6621
Showing
4 changed files
with
179 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Assignment 4 | ||
|
||
The goal of this assignment is to get to know the basic functionality of OpenMP. | ||
|
||
## Exercise 1 (1.5 Points) | ||
|
||
### Description | ||
|
||
This exercise consists of implementing a Monte Carlo Pi approximation in OpenMP. | ||
|
||
### Tasks | ||
|
||
1) Review the pthreads-based implementation provided in [monte_carlo_pi/mc_pi_pthreads.c](monte_carlo_pi/mc_pi_pthreads.c). Benchmark it with 1, 2, 4, 6, and 12 threads on LCC3. What can you observe? | ||
2) Implement parallel versions of this approximation using OpenMP. In total, three different versions using the following OpenMP constructs should be provided: | ||
1) `critical` section | ||
2) `atomic` statement | ||
3) `reduction` clause | ||
3) To increase the performance difference among these versions, make sure you increment the samples counter directly, without aggregating to private variables first. | ||
4) Benchmark your OpenMP implementations with the same number of threads using OpenMP's time measurement function. What can you observe? How do those results compare to your earlier measurements? | ||
5) The tool `/usr/bin/time` can be used to get useful information on the properties of a program's execution, e.g. its execution time or the maximum amount of main memory used. Measure the execution time of your OpenMP implementation using `/usr/bin/time -v <program_name>`. Take a look at the output, specifically "user time" and "elapsed (wall clock) time". How do they differ? Does either of them match the time measurement function of OpenMP? | ||
6) Add the wall clock time measurements for 12 threads on LCC3 to the comparison spreadsheet linked on Discord. | ||
|
||
## Exercise 2 (1.5 Points) | ||
|
||
### Description | ||
|
||
In this exercise, you are asked to investigate the effect of false sharing in multi-threaded programs. | ||
|
||
### Tasks | ||
|
||
1) Implement (or copy from Exercise 1) a parallel Monte Carlo Pi version that uses a local sum approach, i.e. that first aggregates to a per-thread private variable before using `atomic` to aggregate the entire sum of samples. | ||
2) Create a second version that does not rely on private variables but on a single array where each thread gets one element for local sum storage. In memory, the data layout should then look like `[thread_0][thread_1][thread_2][...]`. | ||
3) Create a third version that continues to use a single array but adds padding to it, ensuring that the individual local sum storage locations are separated by unused data, e.g. `[thread_0][N_unused_bytes][thread_1][N_unused_bytes][thread_2][...]`. How you achieve this padding is up to you (there are several implementation possibilities). How large should the padding distance ideally be? | ||
4) Benchmark all three versions (private variable, array, array with padding) and document your results. Also check the L1 cache misses using `perf stat`. Feel free to also check for this effect on your local machines and report the data (including the CPU type!). | ||
5) Enter the wall clock time of each version for 12 threads on LCC3 into the comparison spreadsheet linked on Discord. | ||
|
||
## General Notes | ||
|
||
All the material required by the tasks above (e.g., code, figures, text, etc...) must be part of the solution that is handed in. Your experiments should be reproducible and comparable to your measurements using the solution materials that you hand in. | ||
|
||
**Every** member of your group must be able to explain the given problem, your solution, and possible findings. You may also need to answer detailed questions about any of these aspects. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Build script for the pthreads-based Monte Carlo Pi example.

# Compiler and the flags handed to it on every build
CC=gcc | ||
CFLAGS=-Wall -Wextra -Wpedantic -Werror -std=c17 -O3 -march=native -g | ||
# Appended at the very end of the build command (see the pattern rule below)
LDFLAGS=-lpthread | ||
|
||
# Programs built by `all` and removed by `clean`
TARGETS=mc_pi_pthreads | ||
|
||
# `all` and `clean` name actions, not files
.PHONY: all clean | ||
|
||
all: $(TARGETS) | ||
|
||
clean: | ||
$(RM) $(TARGETS) | ||
|
||
# Pattern rule: build an executable from the .c file of the same name
%: %.c | ||
$(CC) $(CFLAGS) $< -o $@ $(LDFLAGS) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
#define _POSIX_C_SOURCE 199506L | ||
|
||
#include <stdlib.h> | ||
#include <stdio.h> | ||
#include <stdint.h> | ||
#include <stdbool.h> | ||
#include <pthread.h> | ||
#include <time.h> | ||
|
||
#define START_SEED 0 | ||
|
||
typedef uint64_t timepoint_t; | ||
typedef uint32_t count_t; | ||
|
||
// Per-thread work description; main() passes one instance to each
// monte_carlo_pi() thread via pthread_create().
struct MonteCarloPiInput { | ||
// number of random sample points this thread draws
count_t iterations; | ||
// thread index; added to START_SEED to form the thread's rand_r() seed
uint32_t id; | ||
}; | ||
|
||
static void* monte_carlo_pi(void* arg); | ||
static timepoint_t time_ns(); | ||
static double elapsed_seconds(timepoint_t start, timepoint_t end); | ||
|
||
int main(int argc, char** argv) { | ||
// read input arguments | ||
if (argc > 2) { | ||
fprintf(stderr, "Usage: %s [<thread_count>]\n", argv[0]); | ||
return EXIT_FAILURE; | ||
} | ||
|
||
const int32_t requested_threads = (argc <= 1) ? 1 : atoi(argv[1]); | ||
const uint32_t num_threads = (requested_threads < 1) ? 1 : (uint32_t) requested_threads; | ||
|
||
const count_t total_iterations = ((count_t) 700) * 1000 * 1000; | ||
const count_t per_thread_iterations = total_iterations / num_threads; | ||
|
||
bool simulation_success = true; | ||
|
||
// start time measurement | ||
timepoint_t start = time_ns(); | ||
|
||
// create and start threads | ||
pthread_t threads[num_threads]; | ||
struct MonteCarloPiInput inputs[num_threads]; | ||
int creation_statuses[num_threads]; | ||
|
||
for (uint32_t i = 0; i < num_threads; ++i) { | ||
inputs[i] = (struct MonteCarloPiInput) { | ||
.id = i, | ||
.iterations = per_thread_iterations + ((i == 0) ? (total_iterations % num_threads) : 0) | ||
}; | ||
|
||
creation_statuses[i] = pthread_create(&threads[i], NULL, monte_carlo_pi, (void*) &inputs[i]); | ||
|
||
if (creation_statuses[i] != 0) { | ||
fprintf(stderr, "Failed to create thread %u with error %d\n", i, creation_statuses[i]); | ||
simulation_success = false; | ||
} | ||
} | ||
|
||
// join threads and calculate result | ||
count_t points_in_circle = 0; | ||
for (uint32_t i = 0; i < num_threads; ++i) { | ||
if (creation_statuses[i] == 0) { | ||
count_t* result = NULL; | ||
int status = pthread_join(threads[i], (void**) &result); | ||
|
||
if (status != 0) { | ||
fprintf(stderr, "Joining thread %u failed with error %d\n", i, status); | ||
simulation_success = false; | ||
} else if (result != NULL) { | ||
points_in_circle += *result; | ||
free(result); | ||
} | ||
} | ||
} | ||
|
||
double pi_approximation = 4.0 * (points_in_circle / (double) total_iterations); | ||
|
||
// print result and elapsed time | ||
timepoint_t end = time_ns(); | ||
double elapsed_time = elapsed_seconds(start, end); | ||
|
||
if (simulation_success) { | ||
printf("Approximation of PI took %.3f seconds with %u threads - value: %.10f\n", | ||
elapsed_time, num_threads, pi_approximation); | ||
} else { | ||
printf("Simulation failed!\n"); | ||
} | ||
|
||
return (simulation_success) ? EXIT_SUCCESS : EXIT_FAILURE; | ||
} | ||
|
||
static void* monte_carlo_pi(void* arg) { | ||
struct MonteCarloPiInput* input = (struct MonteCarloPiInput*) arg; | ||
unsigned int seed = START_SEED + input->id; | ||
|
||
count_t* points_in_cricle = calloc(1, sizeof(count_t)); | ||
|
||
for (count_t i = 0; i < input->iterations; ++i) { | ||
float x = (rand_r(&seed) / (float) RAND_MAX); | ||
float y = (rand_r(&seed) / (float) RAND_MAX); | ||
|
||
if (x * x + y * y <= 1.0f) { | ||
++(*points_in_cricle); | ||
} | ||
} | ||
|
||
// implicit call to pthread_exit() | ||
return (void*) points_in_cricle; | ||
} | ||
|
||
static timepoint_t time_ns() { | ||
// Note: We assume that all time information fits into an uint64_t | ||
struct timespec now; | ||
clock_gettime(CLOCK_MONOTONIC, &now); | ||
return ((timepoint_t) now.tv_sec) * ((timepoint_t) 1E9) + now.tv_nsec; | ||
} | ||
|
||
/**
 * Converts the distance between two nanosecond timepoints into seconds.
 *
 * The argument order does not matter: the absolute difference is used.
 *
 * @param start first timepoint in nanoseconds
 * @param end   second timepoint in nanoseconds
 * @return absolute difference of the two timepoints in seconds
 */
static double elapsed_seconds(timepoint_t start, timepoint_t end) {
    timepoint_t delta_ns;

    if (start > end) {
        delta_ns = start - end;
    } else {
        delta_ns = end - start;
    }

    return delta_ns / 1E9;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters