Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make bit counting more clever #933

Merged
merged 3 commits into from
Aug 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions BENCHMARK.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Benchmarks <!-- omit in toc -->

This document describes how to build and run all benchmarks (solutions) for different operating systems that can be automatically executed.
This document describes how to build and run all benchmarks (solutions) for different operating systems that can be automatically executed.

Some solutions are not included in the automated benchmark runs, either because no Dockerfile is included in the solution, or due to legal reasons. We do strive to include all solutions in the automated benchmark runs, if it is technically and legally possible to do so.

## Table of contents <!-- omit in toc -->

- [What operating system to use?](#what-operating-system-to-use)
- [General working mechanism](#general-working-mechanism)
- [Linux](#linux)
Expand Down Expand Up @@ -349,8 +350,9 @@ make DIRECTORY=PrimeCPP
## Running in unconfined mode

For some interpreted languages (Python, Ruby, NodeJS), docker has a non-zero effect slowing CPU-intensive code.
See https://github.com/moby/moby/issues/41389 for the related docker issue. You can disable some of the sandboxing
See <https://github.com/moby/moby/issues/41389> for the related docker issue. You can disable some of the sandboxing
to obtain near-native performance (at least on Linux) with the `UNCONFINED=1` option:

```bash
make UNCONFINED=1
make DIRECTORY=PrimeMyFavoriteInterpretedLanguage UNCONFINED=1
Expand All @@ -361,15 +363,15 @@ make DIRECTORY=PrimeMyFavoriteInterpretedLanguage UNCONFINED=1
The benchmark suite supports multiple output formats; if no formatter is specified, it will default to the `table` format.
Here are the supported values:

* table
* chart
* csv
* json (this also includes the machine information data)
* minifiedjson (same as json, but not pretty-printed)
- table
- chart
- csv
- json (this also includes the machine information data)
- minifiedjson (same as json, but not pretty-printed)

The output format can be controlled via the `FORMATTER` variable like this:

```
```shell
make FORMATTER=json
make DIRECTORY=PrimeCrystal/solution_1 FORMATTER=csv
```
78 changes: 41 additions & 37 deletions PrimeCUDA/solution_2/primes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,27 +25,27 @@ __global__ void initialize_buffer(uint64_t blockSize, uint64_t wordCount, sieve_
__global__ void unmark_multiples_threads(uint32_t primeCount, uint32_t *primes, uint64_t halfSize, uint32_t sizeSqrt, sieve_t *sieve)
{
// We unmark every "MAX_THREADS"th prime's multiples, starting with our thread index
for (uint32_t primeIndex = threadIdx.x; primeIndex < primeCount; primeIndex += MAX_THREADS)
for (uint32_t primeIndex = threadIdx.x; primeIndex < primeCount; primeIndex += MAX_THREADS)
{
const uint32_t prime = primes[primeIndex];
const uint64_t primeSquared = uint64_t(prime) * prime;

// Unmark multiples starting at just beyond the square root of the sieve size or the square of the prime,
// Unmark multiples starting at just beyond the square root of the sieve size or the square of the prime,
// whichever is larger.
uint64_t firstUnmarked = primeSquared > sizeSqrt ? primeSquared : ((sizeSqrt / prime + 1) * prime);
// We're marking off odd multiples only, so make sure we start with one of those!
if (!(firstUnmarked & 1))
firstUnmarked += prime;

for (uint64_t index = firstUnmarked >> 1; index <= halfSize; index += prime)
// Clear the bit in the word that corresponds to the last part of the index
for (uint64_t index = firstUnmarked >> 1; index <= halfSize; index += prime)
// Clear the bit in the word that corresponds to the last part of the index
atomicAnd(&sieve[WORD_INDEX(index)], ~(sieve_t(1) << BIT_INDEX(index)));
}
}

__global__ void unmark_multiples_blocks(uint32_t primeCount, uint32_t *primes, uint64_t halfSize, uint32_t sizeSqrt, uint32_t maxBlockIndex, uint64_t blockSize, sieve_t *sieve)
{
// Calculate the start and end of the block we need to work on, at buffer word boundaries.
// Calculate the start and end of the block we need to work on, at buffer word boundaries.
// Note that the first variable is a number in sieve space...
uint64_t blockStart = uint64_t(blockIdx.x) * blockSize + sizeSqrt;
// ...and the second is an index in the sieve buffer (representing odd numbers only)
Expand All @@ -64,7 +64,7 @@ __global__ void unmark_multiples_blocks(uint32_t primeCount, uint32_t *primes, u
const uint32_t prime = primes[primeIndex];
const uint64_t primeSquared = uint64_t(prime) * prime;

// Unmark multiples starting at just beyond the start of our block or the square of the prime,
// Unmark multiples starting at just beyond the start of our block or the square of the prime,
// whichever is larger.
uint64_t firstUnmarked = primeSquared >= blockStart ? primeSquared : ((blockStart / prime + 1) * prime);
// We're marking off odd multiples only, so make sure we start with one of those!
Expand All @@ -79,18 +79,18 @@ __global__ void unmark_multiples_blocks(uint32_t primeCount, uint32_t *primes, u
continue;

uint64_t wordIndex = WORD_INDEX(index);
uint32_t bitIndex = BIT_INDEX(index);
uint32_t bitIndex = BIT_INDEX(index);
sieve_t bitMask = 0;

do
{
// Check if our bit index has moved past the current word's bits. If so...
if (bitIndex > MAX_BIT_INDEX)
if (bitIndex > MAX_BIT_INDEX)
{
// ...clear the current word's bits that are set in the mask, and move on the next word.
sieve[wordIndex++] &= ~bitMask;
// "Shift bitmask one word to the right" through calculation. It has to be done that way
// in part because our word length may be the maximum the GPU supports (64 bits).
// in part because our word length may be the maximum the GPU supports (64 bits).
bitIndex %= BITS_PER_WORD;
bitMask = sieve_t(1) << bitIndex;
}
Expand All @@ -111,8 +111,8 @@ __global__ void unmark_multiples_blocks(uint32_t primeCount, uint32_t *primes, u
{
#endif // ROLLING_LIMIT > 0

for (uint64_t index = firstUnmarked >> 1; index <= lastIndex; index += prime)
// Clear the bit in the word that corresponds to the last part of the index
for (uint64_t index = firstUnmarked >> 1; index <= lastIndex; index += prime)
// Clear the bit in the word that corresponds to the last part of the index
sieve[WORD_INDEX(index)] &= ~(sieve_t(1) << BIT_INDEX(index));

#if ROLLING_LIMIT > 0
Expand All @@ -121,7 +121,7 @@ __global__ void unmark_multiples_blocks(uint32_t primeCount, uint32_t *primes, u
}
}

class Sieve
class Sieve
{
const uint64_t sieve_size;
const uint64_t half_size;
Expand All @@ -131,7 +131,7 @@ class Sieve
sieve_t *device_sieve_buffer;
sieve_t *host_sieve_buffer;

void unmark_multiples(Parallelization type, uint32_t primeCount, uint32_t *primeList)
void unmark_multiples(Parallelization type, uint32_t primeCount, uint32_t *primeList)
{
// Copy the first (square root of sieve size) buffer bytes to the device
cudaMemcpy(device_sieve_buffer, host_sieve_buffer, (size_sqrt >> 4) + 1, cudaMemcpyHostToDevice);
Expand Down Expand Up @@ -165,10 +165,10 @@ class Sieve
// ...and increase that if the division left a remainder.
if (sieveSpace & SIEVE_BITS_MASK)
wordCount++;

// The number of blocks is the maximum thread count or the number of words, whichever is lower
const uint32_t blockCount = (uint32_t)min(uint64_t(MAX_THREADS), wordCount);

uint64_t blockSize = sieveSpace / blockCount;
// Increase block size if the calculating division left a remainder
if (sieveSpace % blockCount)
Expand All @@ -187,13 +187,13 @@ class Sieve
fprintf(stderr, "WARNING: Parallelization type %d unknown, multiple unmarking skipped!\n\n", to_underlying(type));
break;
}

// Release the device prime list buffer
cudaFree(devicePrimeList);

// Copy the sieve buffer from the device to the host. This function implies a wait for all GPU threads to finish.
cudaMemcpy(host_sieve_buffer, device_sieve_buffer, buffer_byte_size, cudaMemcpyDeviceToHost);

#ifdef DEBUG
printf("- device to host copy of sieve buffer complete.\n");
#endif
Expand All @@ -217,7 +217,7 @@ class Sieve

// The number of blocks is the maximum number of threads or the number of words in the buffer, whichever is lower
const uint32_t blockCount = (uint32_t)min(uint64_t(MAX_THREADS), buffer_word_size);

uint64_t blockSize = buffer_word_size / blockCount;
// Increase block size if the calculating division left a remainder
if (buffer_word_size % blockCount)
Expand All @@ -229,7 +229,7 @@ class Sieve

initialize_buffer<<<blockCount, 1>>>(blockSize, buffer_word_size, device_sieve_buffer);

// Allocate host sieve buffer (odd numbers only) and initialize the bytes up to the square root of the sieve
// Allocate host sieve buffer (odd numbers only) and initialize the bytes up to the square root of the sieve
// size to all 1s.
host_sieve_buffer = (sieve_t *)malloc(buffer_byte_size);
memset(host_sieve_buffer, 255, (size_sqrt >> 4) + 1);
Expand All @@ -242,7 +242,7 @@ class Sieve
#endif
}

~Sieve()
~Sieve()
{
cudaFree(device_sieve_buffer);
free(host_sieve_buffer);
Expand All @@ -267,7 +267,7 @@ class Sieve
{
uint64_t index = factor >> 1;

if (host_sieve_buffer[WORD_INDEX(index)] & (sieve_t(1) << BIT_INDEX(index)))
if (host_sieve_buffer[WORD_INDEX(index)] & (sieve_t(1) << BIT_INDEX(index)))
{
primeList[primeCount++] = factor;

Expand All @@ -283,36 +283,40 @@ class Sieve
return host_sieve_buffer;
}

uint64_t count_primes()
uint64_t count_primes()
{
uint64_t primeCount = 0;
const uint64_t lastWord = WORD_INDEX(half_size);
sieve_t word;

// For all buffer words except the last one, just count the set bits in the word until there are none left.
// We only hold bits for odd numbers in the sieve buffer. However, due to a small "mathematical coincidence"
// bit 0 of word 0 effectively represents the only even prime 2. This means the "count set bits" approach
// bit 0 of word 0 effectively represents the only even prime 2. This means the "count set bits" approach
// in itself yields the correct result.
for (uint64_t index = 0; index < lastWord; index++)
{
word = host_sieve_buffer[index];
while (word)
{
if (word & 1)
primeCount++;

word >>= 1;
// This is Brian Kernighan's algorithm for counting set bits. It uses the fact that subtracting 1 from an
// integer flips all bits right of the right-most set bit, and that set bit itself. So, doing a bitwise
// AND of an integer with "itself minus one" clears the right-most set bit. This reduces the number of
// loop iterations to the number of set bits. There is a one-statement implementation for this using a
// for loop, but we'll be kind to our readers. :)
while (word)
{
word &= word - 1;
primeCount++;
}
}

// For the last word, only count bits up to the (halved) sieve limit
word = host_sieve_buffer[lastWord];
const uint32_t lastBit = BIT_INDEX(half_size);
for (uint32_t index = 0; word && index <= lastBit; index++)
for (uint32_t index = 0; word && index <= lastBit; index++)
{
if (word & 1)
primeCount++;

word >>= 1;
}

Expand All @@ -334,7 +338,7 @@ const std::map<uint64_t, const int> resultsDictionary =
{ 10'000'000'000UL, 455052511 },
};

const std::map<Parallelization, const char *> parallelizationDictionary =
const std::map<Parallelization, const char *> parallelizationDictionary =
{
{ Parallelization::threads, "threads" },
{ Parallelization::blocks, "blocks" }
Expand All @@ -348,12 +352,12 @@ uint64_t determineSieveSize(int argc, char *argv[])

const uint64_t sieveSize = strtoul(argv[1], nullptr, 0);

if (sieveSize == 0)
if (sieveSize == 0)
return DEFAULT_SIEVE_SIZE;

if (resultsDictionary.find(sieveSize) == resultsDictionary.end())
fprintf(stderr, "WARNING: Results cannot be validated for selected sieve size of %zu!\n\n", sieveSize);

return sieveSize;
}

Expand All @@ -364,7 +368,7 @@ void printResults(Parallelization type, uint64_t sieveSize, uint64_t primeCount,
const auto parallelizationEntry = parallelizationDictionary.find(type);
const char *parallelizationLabel = parallelizationEntry != parallelizationDictionary.end() ? parallelizationEntry->second : "unknown";

fprintf(stderr, "Passes: %zu, Time: %lf, Avg: %lf, Word size: %d, Max GPU threads: %d, Type: %s, Limit: %zu, Count: %zu, Validated: %d\n",
fprintf(stderr, "Passes: %zu, Time: %lf, Avg: %lf, Word size: %d, Max GPU threads: %d, Type: %s, Limit: %zu, Count: %zu, Validated: %d\n",
passes,
duration,
duration / passes,
Expand Down Expand Up @@ -411,15 +415,15 @@ int main(int argc, char *argv[])
}
while (duration_cast<seconds>(runTime).count() < 5);
#endif

#ifdef DEBUG
printf("\n");
#endif

const size_t primeCount = sieve->count_primes();

delete sieve;

printResults(type, sieveSize, primeCount, duration_cast<microseconds>(runTime).count() / 1000000.0, passes);
printResults(type, sieveSize, primeCount, duration_cast<microseconds>(runTime).count() / 1000000.0, passes);
}
}
10 changes: 5 additions & 5 deletions PrimeCUDA/solution_2/primes.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#define DEFAULT_SIEVE_SIZE 1'000'000

// Maximum number of parallel threads used on the GPU. The actual number of threads used can be lower,
// Maximum number of parallel threads used on the GPU. The actual number of threads used can be lower,
// if there aren't enough "chunks of work" to keep this number of threads busy.
#define MAX_THREADS 256

Expand All @@ -9,7 +9,7 @@

// Highest prime number for which we'll use a rolling bit mask. This value cannot be higher than the
// number of bits per word. Set to 0 to disable the use of a rolling bit mask. The relevant code won't
// be compiled then, thus reducing performance side effects to zero.
// be compiled then, thus reducing performance side effects to zero.
#define ROLLING_LIMIT 29

// If defined, the code will show debug output and run both unmarking methods once.
Expand All @@ -28,11 +28,11 @@
#endif

#if BITS_PER_WORD == 32
typedef unsigned int sieve_t;
typedef uint32_t sieve_t;
#define MAX_WORD_VALUE UINT32_MAX
#define WORD_SHIFT 5
#elif BITS_PER_WORD == 64
typedef unsigned long long sieve_t;
typedef uint64_t sieve_t;
#define MAX_WORD_VALUE UINT64_MAX
#define WORD_SHIFT 6
#else
Expand All @@ -55,7 +55,7 @@ enum class Parallelization : char

// We have to define this ourselves, as we're not doing C++23 (yet)
template<class TEnum>
constexpr auto to_underlying(TEnum enumValue)
constexpr auto to_underlying(TEnum enumValue)
{
return static_cast<typename std::underlying_type<TEnum>::type>(enumValue);
}