Let's have fun with prime numbers, threads, thread pool, TPL and CUDA?

Let’s have fun with prime numbers? In this post, I would like to share some results I got from using multi-threading with .NET and CUDA to find prime numbers in a range.

My machine:

Intel Core i7-7700HQ CPU @ 2.80GHz
32 GB RAM
Windows 10 Pro
NVIDIA GeForce GTX 1070

It is important to say that I am NOT using the best algorithms here. I know there are better approaches to find prime numbers. Also, I am pretty sure there are a lot of improvements that I could implement in my code. So, take it easy. Right?

The book Pro .NET Performance inspired the code in this post.

The starting point

Let’s start with a straightforward sequential implementation.

/// <summary>
/// Entry point: times a prime count over [200, 800000) and prints the outcome.
/// </summary>
static void Main()
{
    var stopwatch = Stopwatch.StartNew();
    var primeCount = PrimesInRange(200, 800000);
    stopwatch.Stop();

    // Integer division: the elapsed time is reported in whole seconds.
    var elapsedSeconds = stopwatch.ElapsedMilliseconds / 1000;
    Console.WriteLine($"{primeCount} prime numbers found in {elapsedSeconds} seconds ({Environment.ProcessorCount} processors).");
}

/// <summary>
/// Counts, on the calling thread, how many numbers in the half-open
/// interval [start, end) are reported prime by IsPrime.
/// </summary>
/// <param name="start">First candidate (inclusive).</param>
/// <param name="end">Upper bound (exclusive).</param>
/// <returns>The number of candidates for which IsPrime returned true.</returns>
public static long PrimesInRange(long start, long end)
{
    long primeCount = 0;
    var candidate = start;
    while (candidate < end)
    {
        if (IsPrime(candidate)) primeCount++;
        candidate++;
    }
    return primeCount;
}

/// <summary>
/// Tests a number for primality by trial division with odd divisors.
/// </summary>
/// <param name="number">The candidate to test.</param>
/// <returns>true when <paramref name="number"/> is prime; otherwise false.</returns>
static bool IsPrime(long number)
{
    // 0, 1 and negative values are not prime. Without this guard, 1 and every
    // negative odd number would skip the loop below and be reported as prime.
    if (number < 2) return false;
    if (number == 2) return true;
    if (number % 2 == 0) return false;

    // Deliberately simple bound (number / 2): the surrounding article states it
    // is not using the best algorithm, and its timings depend on this loop.
    for (long divisor = 3; divisor < (number / 2); divisor += 2)
    {
        if (number % divisor == 0)
        {
            return false;
        }
    }
    return true;
}

Time to run: ~76 seconds!

Using Threads

/// <summary>
/// Counts the primes in [start, end) with one dedicated thread per logical
/// processor. Every prime found bumps a shared counter while holding a lock.
/// </summary>
/// <param name="start">First candidate (inclusive).</param>
/// <param name="end">Upper bound (exclusive).</param>
/// <returns>The number of candidates for which IsPrime returned true.</returns>
public static long PrimesInRange(long start, long end)
{
    var workerCount = (long) Environment.ProcessorCount;
    var sliceLength = (end - start) / workerCount;

    long total = 0;
    var gate = new object();
    var workers = new Thread[workerCount];

    for (long slot = 0; slot < workerCount; slot++)
    {
        // Declared inside the loop so that each closure captures its own bounds.
        var sliceStart = start + slot * sliceLength;
        long sliceEnd;
        if (slot == workerCount - 1)
        {
            // The last slice runs up to 'end' to absorb the division remainder.
            sliceEnd = end;
        }
        else
        {
            sliceEnd = sliceStart + sliceLength;
        }

        var worker = new Thread(() =>
        {
            var candidate = sliceStart;
            while (candidate < sliceEnd)
            {
                if (IsPrime(candidate))
                {
                    lock (gate)
                    {
                        total++;
                    }
                }
                ++candidate;
            }
        });
        workers[slot] = worker;
        worker.Start();
    }

    // Wait for every slice before reading the shared counter.
    foreach (var pending in workers)
    {
        pending.Join();
    }

    return total;
}

This is a naïve implementation. Do you know why? Share your thoughts in the comments.

Time to run: ~23 seconds.

Using Threads (no locks)

/// <summary>
/// Counts the primes in [start, end) with one thread per logical processor and
/// no locking: each thread owns one slot of a results array, and the slots are
/// summed after all threads have been joined.
/// </summary>
/// <param name="start">First candidate (inclusive).</param>
/// <param name="end">Upper bound (exclusive).</param>
/// <returns>The number of candidates for which IsPrime returned true.</returns>
public static long PrimesInRange2_1(long start, long end)
{
    var range = end - start;
    var numberOfThreads = (long)Environment.ProcessorCount;

    var threads = new Thread[numberOfThreads];
    // One slot per thread; a new long[] starts out zeroed.
    var results = new long[numberOfThreads];

    var chunkSize = range / numberOfThreads;

    for (long i = 0; i < numberOfThreads; i++)
    {
        var chunkStart = start + i * chunkSize;
        // The last chunk runs up to 'end' to absorb the division remainder.
        var chunkEnd = i == (numberOfThreads - 1) ? end : chunkStart + chunkSize;
        // Copy of the loop counter so the closure sees this iteration's index.
        var current = i;

        threads[i] = new Thread(() =>
        {
            // Count in a thread-private local and publish once at the end.
            // Incrementing results[current] on every hit makes all threads write
            // to neighbouring array elements, which can contend for a cache line.
            long localCount = 0;
            for (var number = chunkStart; number < chunkEnd; ++number)
            {
                if (IsPrime(number))
                {
                    localCount++;
                }
            }
            results[current] = localCount;
        });

        threads[i].Start();
    }

    // All slots are final only after every thread has finished.
    foreach (var thread in threads)
    {
        thread.Join();
    }

    return results.Sum();
}

Time to run: ~23 seconds.

Using Threads (Interlocked)

/// <summary>
/// Counts the primes in [start, end) with one thread per logical processor.
/// The shared counter is advanced with Interlocked.Increment rather than a lock.
/// </summary>
/// <param name="start">First candidate (inclusive).</param>
/// <param name="end">Upper bound (exclusive).</param>
/// <returns>The number of candidates for which IsPrime returned true.</returns>
public static long PrimesInRange(long start, long end)
{
    var workerCount = (long)Environment.ProcessorCount;
    var sliceLength = (end - start) / workerCount;
    var workers = new Thread[workerCount];
    long total = 0;

    for (long slot = 0; slot < workerCount; slot++)
    {
        // Per-iteration locals: each closure captures its own bounds.
        var sliceStart = start + slot * sliceLength;
        long sliceEnd;
        if (slot == workerCount - 1)
        {
            // The final slice also covers the remainder of the division.
            sliceEnd = end;
        }
        else
        {
            sliceEnd = sliceStart + sliceLength;
        }

        workers[slot] = new Thread(() =>
        {
            var candidate = sliceStart;
            while (candidate < sliceEnd)
            {
                if (IsPrime(candidate))
                {
                    Interlocked.Increment(ref total);
                }
                ++candidate;
            }
        });
        workers[slot].Start();
    }

    // The counter is complete only once every worker has finished.
    foreach (var worker in workers)
    {
        worker.Join();
    }

    return total;
}

Time to Run: ~23 seconds.

ThreadPool

/// <summary>
/// Counts the primes in [start, end) by queueing fixed-size chunks of the range
/// on the thread pool and waiting until the last chunk reports completion.
/// </summary>
/// <param name="start">First candidate (inclusive).</param>
/// <param name="end">Upper bound (exclusive).</param>
/// <returns>
/// The number of candidates for which IsPrime returned true; 0 when the range
/// is empty or inverted.
/// </returns>
public static long PrimesInRange(long start, long end)
{
    // Nothing to scan. Returning here also avoids waiting on an event that no
    // work item would ever signal.
    if (end <= start) return 0;

    long result = 0;
    const long chunkSize = 100;
    long completed = 0;

    var chunks = (end - start) / chunkSize;
    if (chunks == 0)
    {
        // Range shorter than one chunk: still queue a single work item, otherwise
        // the range would be skipped and WaitOne() below would block forever.
        chunks = 1;
    }

    // The event wraps an OS handle; release it once all chunks have finished.
    using (var allDone = new ManualResetEvent(initialState: false))
    {
        for (long i = 0; i < chunks; i++)
        {
            var chunkStart = (start) + i * chunkSize;
            // The last chunk runs up to 'end' to absorb the division remainder.
            var chunkEnd = i == (chunks - 1) ? end : chunkStart + chunkSize;
            ThreadPool.QueueUserWorkItem(_ =>
            {
                for (var number = chunkStart; number < chunkEnd; number++)
                {
                    if (IsPrime(number))
                    {
                        Interlocked.Increment(ref result);
                    }
                }

                // Whichever work item finishes last releases the waiting caller.
                if (Interlocked.Increment(ref completed) == chunks)
                {
                    allDone.Set();
                }
            });
        }

        allDone.WaitOne();
    }

    return result;
}

Time to Run: ~16 seconds.

Parallel.For

/// <summary>
/// Counts the primes in [start, end) by handing every candidate to
/// Parallel.For and atomically incrementing a shared counter for each prime.
/// </summary>
/// <param name="start">First candidate (inclusive).</param>
/// <param name="end">Upper bound (exclusive).</param>
/// <returns>The number of candidates for which IsPrime returned true.</returns>
public static long PrimesInRange4(long start, long end)
{
    long primeCount = 0;

    Parallel.For(start, end, candidate =>
    {
        if (!IsPrime(candidate)) return;
        Interlocked.Increment(ref primeCount);
    });

    return primeCount;
}

Time to Run: ~16 seconds.

CUDA

#include "device_launch_parameters.h"
#include "cuda_runtime.h"

#include <ctime>
#include <cstdio>


// Kernel: each thread tests one candidate in [200, 800000) for primality by
// trial division and, if it is prime, atomically adds 1 to *result.
//
// The candidate is derived from the x components of the block and thread
// indices only. Threads whose candidate falls at or beyond the upper bound
// exit without touching *result.
//
// result: pointer to the shared counter; the kernel only ever adds to it, so
//         the caller is expected to set it to 0 before the launch.
__global__ void primes_in_range(int *result)
{
	constexpr unsigned int range_start = 200;
	constexpr unsigned int range_end = 800000;

	// Global thread index along x, shifted to the start of the range.
	const unsigned int number = range_start + (blockIdx.x * blockDim.x) + threadIdx.x;
	if (number >= range_end)
	{
		return;
	}

	// Even candidates are rejected outright; 2 is never seen because the range
	// starts at 200.
	if (number % 2 == 0) return;

	// Divisor has the same unsigned type as the candidate, so the comparison and
	// the modulo are not mixed signed/unsigned (or widened where long is 64-bit).
	for (unsigned int divisor = 3; divisor < (number / 2); divisor += 2)
	{
		if (number % divisor == 0)
		{
			return;
		}
	}

	atomicAdd(result, 1);
}

// Host entry point: allocates a managed counter, launches primes_in_range with
// 800 blocks of 1024 threads, waits for the device, then prints the count and
// the elapsed time. Returns 0 on success and 1 if any CUDA call fails.
int main()
{
	// The measured interval covers allocation, launch and synchronisation.
	auto begin = std::clock();

	int *result = nullptr;
	cudaError_t status = cudaMallocManaged(&result, sizeof(int));
	if (status != cudaSuccess)
	{
		// Without this check a failed allocation would be dereferenced below.
		fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(status));
		return 1;
	}
	*result = 0;

	primes_in_range<<<800, 1024>>>(result);

	// A launch error is reported by cudaGetLastError(); errors raised while the
	// kernel runs surface from cudaDeviceSynchronize().
	status = cudaGetLastError();
	if (status == cudaSuccess)
	{
		status = cudaDeviceSynchronize();
	}
	if (status != cudaSuccess)
	{
		fprintf(stderr, "primes_in_range failed: %s\n", cudaGetErrorString(status));
		cudaFree(result);
		return 1;
	}

	auto end = std::clock();
	auto duration = static_cast<double>(end - begin) / CLOCKS_PER_SEC * 1000;

	printf("%d prime numbers found in %d milliseconds",
		*result,
		static_cast<int>(duration)
	);

	// Release the managed allocation now that the value has been printed.
	cudaFree(result);

	// Keep the console window open until a key is pressed.
	getchar();
	return 0;
}

Time to Run: Less than 2 seconds.

Time to Action

I strongly recommend that you reproduce these tests on your machine. If you see something that I could do better, please share your ideas.

I understand that performance is a feature. I will continue to blog about it. Subscribe to the contact list, and I will send you an email every week with the new content.

ou entre em CONTATO

Let’s have fun with prime numbers, threads, thread pool, TPL and CUDA?

The starting point

Using Threads

Using Threads (no locks)

Using Threads (Interlocked)

ThreadPool

Parallel.For

CUDA

Time to Action

Elemar Júnior

Excelência tecnológica potencializando resultados.

Elemar Júnior

Excelência tecnológica potencializando resultados.

Falei sobre RavenDB no Podcast da Lambda3

Essa tal felicidade…

Achando a Raiz de Equações usando o Método de Newton (em F#)

As vezes o caminho é ninguém tentar ganhar para que, no fim, todos ganhem

Finalmente, venci o “Boom Boom”, mas ainda tenho que pagar o IPVAv

Tuplas, tuplas – Retornando múltiplos resultados de uma função em C#

Conecte-se com ElemarJR

Crie sua conta

Crie sua conta

Let’s have fun with prime numbers, threads, thread pool, TPL and CUDA?

The starting point

Using Threads

Using Threads (no locks)

Using Threads (Interlocked)

ThreadPool

Parallel.For

CUDA

Time to Action

Elemar Júnior

Elemar Júnior

Conecte-se com ElemarJR

O Poder do Metamodelo para Profissionais Técnicos Avançarem

Crie sua conta

Crie sua conta