presentation files: tex, gp, and binary pictures and the pdf

master
Denes Matetelki 11 years ago
parent 567e96ae7b
commit f46d548a9e

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 743 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 342 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.7 KiB

@ -0,0 +1,31 @@
#!/usr/bin/gnuplot
# Plots a 1D discrete Gaussian filter kernel sampled at integer offsets
# -4..4 and writes it to 1d_gauss.png.
#input: inline data below, fields separated by ';' ("x;weight"),
#       terminated by the single 'e' line
set datafile separator ";"
#output: small (200x200) transparent PNG
set key top left
# set style data lines
set grid
set xlabel 'X'
# upper bound of the y range equals the kernel's peak weight (at x = 0)
set yrange [0:0.383]
set ylabel 'Y'
set xrange [-4:4]
set terminal png transparent nocrop medium size 200,200
set output '1d_gauss.png'
plot \
'-' using 1:($2) title '1D discrete gaussian filter' axes x1y1 lt rgb 'blue' lw 2
-4;0
-3;0.006
-2;0.061
-1;0.242
0;0.383
1;0.242
2;0.061
3;0.006
4;0
e

@ -0,0 +1,66 @@
#!/usr/bin/gnuplot
# Plots convolution benchmark execution time against the number of threads
# for four implementations (serial, serial-optimized, intel TBB,
# intel TBB-optimized) and writes the chart to convolution.png.
#input: four inline data series below ("number of threads;seconds"),
#       each terminated by an 'e' line, in the same order as the plot clauses
set datafile separator ";"
#output: 800x400 transparent PNG line chart
set key top left
set style data lines
set grid
set xlabel 'Number of threads'
set yrange [0:0.53]
set ylabel 'Executions time [s]'
set terminal png transparent nocrop medium size 800,400
set output 'convolution.png'
plot \
'-' using 1:($2) title 'Serial' axes x1y1 lt rgb 'black' lw 2, \
'-' using 1:($2) title 'Serial - optimized' axes x1y1 lt rgb 'red' lw 2, \
'-' using 1:($2) title 'intel TBB' axes x1y1 lt rgb 'dark-blue' lw 2, \
'-' using 1:($2) title 'intel TBB - optimized' axes x1y1 lt rgb 'dark-green' lw 2
1;0.33
2;0.33
3;0.33
4;0.33
5;0.33
6;0.33
7;0.33
8;0.33
9;0.33
10;0.33
e
1;0.25
2;0.25
3;0.25
4;0.25
5;0.25
6;0.25
7;0.25
8;0.25
9;0.25
10;0.25
e
1;0.52
2;0.26
3;0.27
4;0.14
5;0.14
6;0.13
7;0.13
8;0.12
9;0.13
10;0.12
e
1;0.25
2;0.125
3;0.13
4;0.067
5;0.067
6;0.063
7;0.063
8;0.0577
9;0.063
10;0.0577
e

@ -0,0 +1,67 @@
#!/usr/bin/gnuplot
# Plots parallel-map benchmark execution time against the number of threads
# for four implementations (serial, openMP, intel TBB, QtConcurrent) and
# writes the chart to map.png.
#input: four inline data series below ("number of threads;seconds"),
#       each terminated by an 'e' line, in the same order as the plot clauses
set datafile separator ";"
#output: 800x400 transparent PNG line chart
set key top left
set style data lines
set grid
set xlabel 'Number of threads'
set yrange [0:6.5]
set ylabel 'Executions time [s]'
set terminal png transparent nocrop medium size 800,400
set output 'map.png'
plot \
'-' using 1:($2) title 'Serial' axes x1y1 lt rgb 'black' lw 2, \
'-' using 1:($2) title 'openMP' axes x1y1 lt rgb 'red' lw 2, \
'-' using 1:($2) title 'intel TBB' axes x1y1 lt rgb 'dark-blue' lw 2, \
'-' using 1:($2) title 'QtConcurrent' axes x1y1 lt rgb 'dark-green' lw 2
1;0.5
2;0.67
3;0.68
4;0.69
5;0.68
6;0.7
7;0.65
8;0.72
9;0.68
10;0.69
e
1;6.12
2;3.41
3;2.12
4;1.61
5;1.27
6;1.59
7;1.17
8;1.38
9;0.97
10;0.87
e
1;6.25
2;3.45
3;3.17
4;1.62
5;1.6
6;1.6
7;1.6
8;1.51
9;1.66
10;1.32
e
1;3.22
2;2.2
3;1.65
4;1.32
5;1.43
6;1.44
7;0.95
8;0.9
9;0.85
10;0.8
e

@ -0,0 +1,67 @@
#!/usr/bin/gnuplot
# Plots parallel-reduce benchmark execution time against the number of
# threads for four implementations (serial, openMP, intel TBB, QtConcurrent)
# and writes the chart to reduce.png.
#input: four inline data series below ("number of threads;seconds"),
#       each terminated by an 'e' line, in the same order as the plot clauses
set datafile separator ";"
#output: 800x400 transparent PNG line chart
set key top left
set style data lines
set grid
set xlabel 'Number of threads'
set yrange [0:0.8]
set ylabel 'Executions time [s]'
set terminal png transparent nocrop medium size 800,400
set output 'reduce.png'
plot \
'-' using 1:($2) title 'Serial' axes x1y1 lt rgb 'black' lw 2, \
'-' using 1:($2) title 'openMP' axes x1y1 lt rgb 'red' lw 2, \
'-' using 1:($2) title 'intel TBB' axes x1y1 lt rgb 'dark-blue' lw 2, \
'-' using 1:($2) title 'QtConcurrent' axes x1y1 lt rgb 'dark-green' lw 2
1;0.67
2;0.67
3;0.61
4;0.68
5;0.59
6;0.7
7;0.67
8;0.7
9;0.65
10;0.68
e
1;0.52
2;0.27
3;0.19
4;0.15
5;0.15
6;0.15
7;0.22
8;0.2
9;0.17
10;0.16
e
1;0.68
2;0.35
3;0.35
4;0.18
5;0.18
6;0.18
7;0.19
8;0.18
9;0.18
10;0.15
e
1;0.25
2;0.18
3;0.15
4;0.15
5;0.15
6;0.17
7;0.17
8;0.16
9;0.17
10;0.17
e

@ -0,0 +1,54 @@
#!/usr/bin/gnuplot
# Plots sort benchmark execution time against the number of threads for
# three implementations (serial, openMP, intel TBB) and writes the chart
# to sort1.png.
#input: three inline data series below ("number of threads;seconds"),
#       each terminated by an 'e' line, in the same order as the plot clauses
set datafile separator ";"
#output: 800x400 transparent PNG line chart
set key top left
set style data lines
set grid
set xlabel 'Number of threads'
set yrange [0:3.5]
set ylabel 'Executions time [s]'
set terminal png transparent nocrop medium size 800,400
set output 'sort1.png'
plot \
'-' using 1:($2) title 'Serial' axes x1y1 lt rgb 'black' lw 2, \
'-' using 1:($2) title 'openMP' axes x1y1 lt rgb 'red' lw 2, \
'-' using 1:($2) title 'intel TBB' axes x1y1 lt rgb 'dark-blue' lw 2
1;2.33
2;3.39
3;3.05
4;3.19
5;2.98
6;2.99
7;3.15
8;3.24
9;3.09
10;2.99
e
1;3.04
2;1.6
3;1.66
4;0.84
5;0.71
6;0.65
7;0.7
8;0.61
9;0.56
10;0.51
e
1;3.33
2;1.81
3;1.32
4;1.7
5;1.26
6;0.89
7;0.87
8;0.83
9;0.81
10;0.82
e

@ -0,0 +1,90 @@
#!/usr/bin/gnuplot
# Plots sort benchmark execution time against the number of threads for six
# implementations (serial, openMP, intel TBB, and three custom parallel
# quicksorts) and writes the chart to sort2.png.
# Fix: legend labels were misspelled ("cusom", "threashold") — corrected to
# "custom openMP threshold" / "custom openMP deep" to match the
# "custom QtConcurrent" label.
#input: six inline data series below ("number of threads;seconds"),
#       each terminated by an 'e' line, in the same order as the plot clauses
set datafile separator ";"
#output: 800x400 transparent PNG line chart
set key top left
set style data lines
set grid
set xlabel 'Number of threads'
set yrange [0:18]
set ylabel 'Executions time [s]'
set terminal png transparent nocrop medium size 800,400
set output 'sort2.png'
plot \
'-' using 1:($2) title 'Serial' axes x1y1 lt rgb 'black' lw 2, \
'-' using 1:($2) title 'openMP' axes x1y1 lt rgb 'red' lw 2, \
'-' using 1:($2) title 'intel TBB' axes x1y1 lt rgb 'dark-blue' lw 2, \
'-' using 1:($2) title 'custom openMP threshold' axes x1y1 lt rgb 'steelblue' lw 2, \
'-' using 1:($2) title 'custom openMP deep' axes x1y1 lt rgb 'olivedrab' lw 2, \
'-' using 1:($2) title 'custom QtConcurrent' axes x1y1 lt rgb 'orangered' lw 2
1;0.29
2;0.29
3;0.29
4;0.29
5;0.29
6;0.29
7;0.29
8;0.29
9;0.29
10;0.29
e
1;0.29
2;0.16
3;0.11
4;0.82
5;0.7
6;0.59
7;0.66
8;0.6
9;0.56
10;0.51
e
1;0.29
2;0.18
3;0.13
4;0.69
5;0.94
6;0.87
7;0.83
8;0.8
9;0.84
10;0.82
e
1;14.32
2;11.67
3;6.61
4;5.69
5;5.64
6;4.24
7;3.71
8;3.56
9;3.76
10;3.18
e
1;17.62
2;8.64
3;5.92
4;4.45
5;4.07
6;3.18
7;3.12
8;3.11
9;3.05
10;3.1
e
1;17.39
2;8.91
3;7.93
4;5.74
5;4.64
6;4.19
7;3.97
8;4.68
9;4.85
10;4
e

@ -0,0 +1,84 @@
% colordefs: macro definitions for syntax-highlighted code listings.
% Highlighted text uses \PY{tokclasses}{text}: each token class listed in
% the first argument applies its registered colour/font attributes before
% the text is typeset.
% NOTE(review): looks like generated output of a highlighter in the style of
% Pygments' LaTeX formatter — prefer regenerating over hand-editing.
\def\PY@reset{\let\PY@it=\relax \let\PY@bf=\relax%
\let\PY@ul=\relax \let\PY@tc=\relax%
\let\PY@bc=\relax \let\PY@ff=\relax}
\def\PY@tok#1{\csname PY@tok@#1\endcsname}
\def\PY@toks#1+{\ifx\relax#1\empty\else%
\PY@tok{#1}\expandafter\PY@toks\fi}
\def\PY@do#1{\PY@bc{\PY@tc{\PY@ul{%
\PY@it{\PY@bf{\PY@ff{#1}}}}}}}
\def\PY#1#2{\PY@reset\PY@toks#1+\relax+\PY@do{#2}}
% Per-token-class attribute definitions: each \PY@tok@XX sets the text
% colour (\PY@tc) and/or bold/italic (\PY@bf/\PY@it) for one token class.
% NOTE(review): short names presumably follow Pygments conventions
% (e.g. k* = keywords, s* = strings, c* = comments) — confirm against the
% generator before relying on them.
\def\PY@tok@gd{\def\PY@tc##1{\textcolor[rgb]{0.63,0.00,0.00}{##1}}}
\def\PY@tok@gu{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.50,0.00,0.50}{##1}}}
\def\PY@tok@gt{\def\PY@tc##1{\textcolor[rgb]{0.00,0.25,0.82}{##1}}}
\def\PY@tok@gs{\let\PY@bf=\textbf}
\def\PY@tok@gr{\def\PY@tc##1{\textcolor[rgb]{1.00,0.00,0.00}{##1}}}
\def\PY@tok@cm{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
\def\PY@tok@vg{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
\def\PY@tok@m{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\def\PY@tok@mh{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\def\PY@tok@go{\def\PY@tc##1{\textcolor[rgb]{0.50,0.50,0.50}{##1}}}
\def\PY@tok@ge{\let\PY@it=\textit}
\def\PY@tok@vc{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
\def\PY@tok@il{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\def\PY@tok@cs{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
\def\PY@tok@cp{\def\PY@tc##1{\textcolor[rgb]{0.74,0.48,0.00}{##1}}}
\def\PY@tok@gi{\def\PY@tc##1{\textcolor[rgb]{0.00,0.63,0.00}{##1}}}
\def\PY@tok@gh{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}}
\def\PY@tok@ni{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.60,0.60,0.60}{##1}}}
\def\PY@tok@nl{\def\PY@tc##1{\textcolor[rgb]{0.63,0.63,0.00}{##1}}}
\def\PY@tok@nn{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
\def\PY@tok@no{\def\PY@tc##1{\textcolor[rgb]{0.53,0.00,0.00}{##1}}}
\def\PY@tok@na{\def\PY@tc##1{\textcolor[rgb]{0.49,0.56,0.16}{##1}}}
\def\PY@tok@nb{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\def\PY@tok@nc{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
\def\PY@tok@nd{\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}}
\def\PY@tok@ne{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.82,0.25,0.23}{##1}}}
\def\PY@tok@nf{\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
\def\PY@tok@si{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.53}{##1}}}
\def\PY@tok@s2{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\def\PY@tok@vi{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
\def\PY@tok@nt{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\def\PY@tok@nv{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
\def\PY@tok@s1{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\def\PY@tok@sh{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\def\PY@tok@sc{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\def\PY@tok@sx{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\def\PY@tok@bp{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\def\PY@tok@c1{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
\def\PY@tok@kc{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\def\PY@tok@c{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
\def\PY@tok@mf{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\def\PY@tok@err{\def\PY@bc##1{\fcolorbox[rgb]{1.00,0.00,0.00}{1,1,1}{##1}}}
\def\PY@tok@kd{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\def\PY@tok@ss{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
\def\PY@tok@sr{\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.53}{##1}}}
\def\PY@tok@mo{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\def\PY@tok@kn{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\def\PY@tok@mi{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\def\PY@tok@gp{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}}
\def\PY@tok@o{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\def\PY@tok@kr{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\def\PY@tok@s{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\def\PY@tok@kp{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\def\PY@tok@w{\def\PY@tc##1{\textcolor[rgb]{0.73,0.73,0.73}{##1}}}
\def\PY@tok@kt{\def\PY@tc##1{\textcolor[rgb]{0.69,0.00,0.25}{##1}}}
\def\PY@tok@ow{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}}
\def\PY@tok@sb{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\def\PY@tok@k{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\def\PY@tok@se{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.13}{##1}}}
\def\PY@tok@sd{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
% Escape macros for characters that are special to LaTeX, used inside
% highlighted listings (backslash, underscore, braces, caret, hash, ...).
\def\PYZbs{\char`\\}
\def\PYZus{\char`\_}
\def\PYZob{\char`\{}
\def\PYZcb{\char`\}}
\def\PYZca{\char`\^}
\def\PYZsh{\char`\#}
\def\PYZpc{\char`\%}
\def\PYZdl{\char`\$}
\def\PYZti{\char`\~}
% for compatibility with earlier versions
\def\PYZat{@}
\def\PYZlb{[}
\def\PYZrb{]}

@ -0,0 +1,988 @@
\documentclass{beamer}
\usepackage[utf8x]{inputenc}
\usepackage{fancyvrb}
\usepackage{color}
\usepackage{graphicx}
\usetheme{Darmstadt}
\title {High-level parallel programming in C++}
\author{Dénes Mátételki}
\institute{www.emerson.com}
\date{March 18, 2012}
\makeatletter
\include{colordefs}
\makeatother
\begin{document}
%----------- slide --------------------------------------------------%
\begin{frame}
\titlepage
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}
\frametitle{Table of contents}
\tableofcontents
\end{frame}
\section{Theory}
\subsection{High level vs. low level}
%----------- slide --------------------------------------------------%
\begin{frame}{Comparison}
\begin{columns}[t]
\column{1.5in}
\begin{block}{High level}
\small
\begin{itemize}
\item Auto scaling-up
\item Threadpool handling, load balancing.
\item Synchronization and mutexes are handled.
\end{itemize}
\end{block}
\column{1.5in}
\begin{block}{Low level}
\small
\begin{itemize}
\item Manual thread creation.
\item Manual joins and mutex handling.
\item Better for event and I/O based threading.
\item Compiler and external library independent.
\end{itemize}
\end{block}
\end{columns}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}{Compared software (performance, code complexity)}
\begin{columns}[t]
\column{1.5in}
\begin{block}{Used}
\small
\begin{itemize}
\item Standard c++ (serial examples)
\item openMP\cite{openmp}
\item Intel Thread Building Blocks (TBB)\cite{itbb}
\item QtConcurrent\cite{qtconcurrent}
\end{itemize}
\end{block}
\column{1.5in}
\begin{block}{Skipped}
\small
\begin{itemize}
\item std::thread, std::mutex (c++0x)\cite{cpp_thread}
\item POSIX threads\cite{posix_threads}
\item QThread\cite{qt_thread}
\end{itemize}
\end{block}
\end{columns}
\begin{exampleblock}{Co-existence\cite{itbb_openmp_nativethreads}}
\small
Possible, but the separate threadpools can lead to oversubscription.
\end{exampleblock}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}{Comparison}
\begin{columns}[t]
\column{1.5in}
\begin{block}{openMP}
\small
\begin{itemize}
\item Compiler support needed.
\item C, C++, fortran.
\item Best for bounded loops.
\item No need for big code re-write.
\item Hard to debug.
\item Managed by a non-profit organization.
\end{itemize}
\end{block}
\column{1.5in}
\begin{block}{Intel TBB}
\small
\begin{itemize}
\item Object oriented.
\item Concurrent data types.
\item Parallel algorithms.
\item Work stealing: dynamic load sharing.
\item Relies heavily on templates.
\item Heavy code rewrite is needed.
\end{itemize}
\end{block}
\column{1.5in}
\begin{block}{QtConcurrent}
\small
\begin{itemize}
\item Object oriented
\item Limited number of algorithms.
\item ...
\end{itemize}
\end{block}
\end{columns}
\end{frame}
\subsection{Algorithms}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Used algorithms for testing}
\begin{block}{List}
\begin{itemize}
\item Map - Applies a given function to each element of a container.
\item Reduction - Combines the results of sub-parts.
\item Sort - Puts elements of a list in a certain order.
\end{itemize}
\end{block}
\begin{exampleblock}{Note}
\small
\begin{itemize}
\item The used container is an \verb|std::vector<float>|
\item Container size was 60 million with random floats [1, 1000]
\item Execution times are the averages of 3 executions.
\item Used hardware was an Intel Xeon 64-bit machine with 6 cores (12 threads), 3.4 GHz.
\item Compiled with gcc-4.4 and use flags: \verb|-O3| \verb|-ffast-math| \verb|-fwhole-program|
\verb|-fomit-frame-pointer| \verb|-march=native| \verb|-m64|
\end{itemize}
\end{exampleblock}
\end{frame}
\section{Code samples}
\subsection{Map}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Serial map}
\begin{block}{c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k+kt}{float} \PY{n}{modify}\PY{p}{(}\PY{k+kt}{float} \PY{n}{value}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{k}{return} \PY{l+m+mf}{13.37} \PY{o}{*} \PY{n}{pow}\PY{p}{(}\PY{n}{sqrt}\PY{p}{(}\PY{n}{value}\PY{p}{)}\PY{p}{,} \PY{n}{log}\PY{p}{(}\PY{n}{value}\PY{p}{)}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\PY{k+kt}{void} \PY{n}{serialMap}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{k}{for} \PY{p}{(}\PY{n}{size\PYZus{}t} \PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)}
\PY{n}{modify}\PY{p}{(}\PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\end{Verbatim}
\end{block}
\begin{exampleblock}{Note}
\small
\begin{itemize}
\item ``chunksize'' equals the size of the data.
\item This modify function will be used by the parallel examples too.
\end{itemize}
\end{exampleblock}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{openMP parallel map}
\begin{block}{c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k+kt}{void} \PY{n}{openMpMap}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{,}
\PY{k}{const} \PY{k+kt}{int} \PY{n}{numberOfThreads}\PY{p}{,}
\PY{k}{const} \PY{k+kt}{int} \PY{n}{chunkSize}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{n}{size\PYZus{}t} \PY{n}{i}\PY{p}{;}
\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp parallel for \PYZbs{}}
\PY{c+cp}{ default(shared) private(i) \PYZbs{}}
\PY{c+cp}{ schedule(dynamic, chunkSize) \PYZbs{}}
\PY{c+cp}{ num\PYZus{}threads(numberOfThreads)}
\PY{k}{for} \PY{p}{(}\PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)}
\PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{=} \PY{n}{modify}\PY{p}{(}\PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\end{Verbatim}
\end{block}
\begin{exampleblock}{Note}
\small
Making it run in parallel is just a single pragma line.
\end{exampleblock}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Intel TBB map}
\begin{block}{c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k}{class} \PY{n+nc}{itbbMap} \PY{p}{\PYZob{}}
\PY{k}{public}\PY{o}{:}
\PY{n}{itbbMap}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)}
\PY{o}{:} \PY{n}{m\PYZus{}data}\PY{p}{(}\PY{n}{data}\PY{p}{)} \PY{p}{\PYZob{}}\PY{p}{\PYZcb{}}
\PY{k+kt}{void} \PY{k}{operator}\PY{p}{(}\PY{p}{)}\PY{p}{(}\PY{k}{const} \PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{blocked\PYZus{}range}\PY{o}{<}\PY{n}{size\PYZus{}t}\PY{o}{>}\PY{o}{&} \PY{n}{r}\PY{p}{)} \PY{k}{const} \PY{p}{\PYZob{}}
\PY{k}{for}\PY{p}{(} \PY{n}{size\PYZus{}t} \PY{n}{i} \PY{o}{=} \PY{n}{r}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i} \PY{o}{!}\PY{o}{=} \PY{n}{r}\PY{p}{.}\PY{n}{end}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+} \PY{p}{)}
\PY{n}{m\PYZus{}data}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{=} \PY{n}{modify}\PY{p}{(}\PY{n}{m\PYZus{}data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\PY{k}{private}\PY{o}{:}
\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{m\PYZus{}data}\PY{p}{;}
\PY{p}{\PYZcb{}}\PY{p}{;}
\PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{task\PYZus{}scheduler\PYZus{}init} \PY{n}{init}\PY{p}{(}\PY{n}{NUMBER\PYZus{}OF\PYZus{}THREADS}\PY{p}{)}\PY{p}{;}
\PY{n}{itbbMap} \PY{n}{im}\PY{p}{(}\PY{n}{data}\PY{p}{)}\PY{p}{;}
\PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{parallel\PYZus{}for}\PY{p}{(}\PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{blocked\PYZus{}range}\PY{o}{<}\PY{n}{size\PYZus{}t}\PY{o}{>}\PY{p}{(}\PY{l+m+mi}{0}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{CHUNK\PYZus{}SIZE}\PY{p}{)}\PY{p}{,} \PY{n}{im}\PY{p}{)}\PY{p}{;}
\end{Verbatim}
\end{block}
\begin{exampleblock}{Note}
\small
Running a functor on chunks in parallel.
\end{exampleblock}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{QtConcurrent map}
\begin{block}{c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k+kt}{void} \PY{n}{QtMap}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{n}{QtConcurrent}\PY{o}{:}\PY{o}{:}\PY{n}{blockingMap}\PY{p}{(}\PY{n}{data}\PY{p}{,} \PY{n}{modify}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\PY{n}{QThreadPool}\PY{o}{:}\PY{o}{:}\PY{n}{globalInstance}\PY{p}{(}\PY{p}{)}\PY{o}{-}\PY{o}{>}\PY{n}{setMaxThreadCount}\PY{p}{(}\PY{n}{NUMBER\PYZus{}OF\PYZus{}THREADS}\PY{p}{)}\PY{p}{;}
\end{Verbatim}
\end{block}
\begin{exampleblock}{Note}
\small
\begin{itemize}
\item Chunksize is 1.
\item Blocks till the iterator reaches the end.
\end{itemize}
\end{exampleblock}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Map execution times}
\begin{center}
\includegraphics[height=5cm]{map.png}
\end{center}
\begin{exampleblock}{Note}
\small
Serial remained the fastest (memory bound?) - No need to parallelize.
\end{exampleblock}
\end{frame}
\subsection{Reduce}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Serial reduce}
\begin{block}{c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k+kt}{float} \PY{n}{serialReduce}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{k+kt}{float} \PY{n}{min}\PY{p}{(}\PY{n}{FLT\PYZus{}MAX}\PY{p}{)}\PY{p}{;}
\PY{k}{for} \PY{p}{(}\PY{n}{size\PYZus{}t} \PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)}
\PY{k}{if} \PY{p}{(}\PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{<} \PY{n}{min}\PY{p}{)}
\PY{n}{min} \PY{o}{=} \PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{;}
\PY{k}{return} \PY{n}{min}\PY{p}{;}
\PY{p}{\PYZcb{}}
\end{Verbatim}
\end{block}
\begin{exampleblock}{Note}
\small
\begin{itemize}
\item Minimum value search.
\item Not actually a reduce.
\item Following examples will try to achieve this too.
\end{itemize}
\end{exampleblock}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{openMP reduce}
\begin{block}{c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k+kt}{int} \PY{n}{openMpReduce}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{,}
\PY{k}{const} \PY{k+kt}{int} \PY{n}{numberOfThreads}\PY{p}{,}
\PY{k}{const} \PY{k+kt}{int} \PY{n}{chunkSize}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{n}{size\PYZus{}t} \PY{n}{i}\PY{p}{;}
\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>} \PY{n}{separate\PYZus{}results}\PY{p}{(}\PY{n}{numberOfThreads}\PY{p}{,} \PY{n}{FLT\PYZus{}MAX}\PY{p}{)}\PY{p}{;}
\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp parallel \PYZbs{}}
\PY{c+cp}{ default(shared) private(i) \PYZbs{}}
\PY{c+cp}{ num\PYZus{}threads(numberOfThreads)}
\PY{p}{\PYZob{}}
\PY{k+kt}{int} \PY{n}{threadId} \PY{o}{=} \PY{n}{omp\PYZus{}get\PYZus{}thread\PYZus{}num}\PY{p}{(}\PY{p}{)}\PY{p}{;}
\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp for schedule(dynamic, chunkSize)}
\PY{k}{for} \PY{p}{(}\PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)}
\PY{k}{if} \PY{p}{(}\PY{n}{separate\PYZus{}results}\PY{p}{[}\PY{n}{threadId}\PY{p}{]} \PY{o}{<} \PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{)}
\PY{n}{separate\PYZus{}results}\PY{p}{[}\PY{n}{threadId}\PY{p}{]} \PY{o}{=} \PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{;}
\PY{p}{\PYZcb{}}
\PY{k+kt}{float} \PY{n}{min}\PY{p}{(}\PY{n}{FLT\PYZus{}MAX}\PY{p}{)}\PY{p}{;}
\PY{k}{for} \PY{p}{(}\PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{numberOfThreads}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)}
\PY{k}{if} \PY{p}{(}\PY{n}{separate\PYZus{}results}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{<} \PY{n}{min}\PY{p}{)}
\PY{n}{min} \PY{o}{=} \PY{n}{separate\PYZus{}results}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{;}
\PY{k}{return} \PY{n}{min}\PY{p}{;}
\PY{p}{\PYZcb{}}
\end{Verbatim}
\end{block}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Intel TBB reduce}
\begin{block}{c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k}{class} \PY{n+nc}{itbbReduce} \PY{p}{\PYZob{}}
\PY{k}{const} \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{m\PYZus{}data}\PY{p}{;}
\PY{k}{public}\PY{o}{:}
\PY{k+kt}{float} \PY{n}{m\PYZus{}min}\PY{p}{;}
\PY{n}{itbbReduce}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)} \PY{o}{:} \PY{n}{m\PYZus{}data}\PY{p}{(}\PY{n}{data}\PY{p}{)} \PY{p}{,} \PY{n}{m\PYZus{}min}\PY{p}{(}\PY{n}{FLT\PYZus{}MAX}\PY{p}{)} \PY{p}{\PYZob{}}\PY{p}{\PYZcb{}}
\PY{n}{itbbReduce}\PY{p}{(}\PY{n}{itbbReduce}\PY{o}{&} \PY{n}{other}\PY{p}{,} \PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{split}\PY{p}{)} \PY{o}{:} \PY{n}{m\PYZus{}data}\PY{p}{(}\PY{n}{other}\PY{p}{.}\PY{n}{m\PYZus{}data}\PY{p}{)}\PY{p}{,} \PY{n}{m\PYZus{}min}\PY{p}{(}\PY{n}{FLT\PYZus{}MAX}\PY{p}{)} \PY{p}{\PYZob{}}\PY{p}{\PYZcb{}}
\PY{k+kt}{void} \PY{k}{operator}\PY{p}{(}\PY{p}{)}\PY{p}{(}\PY{k}{const} \PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{blocked\PYZus{}range}\PY{o}{<}\PY{n}{size\PYZus{}t}\PY{o}{>}\PY{o}{&} \PY{n}{r}\PY{p}{)} \PY{p}{\PYZob{}}
\PY{k+kt}{float} \PY{n}{min} \PY{o}{=} \PY{n}{m\PYZus{}min}\PY{p}{;}
\PY{k}{for}\PY{p}{(}\PY{n}{size\PYZus{}t} \PY{n}{i} \PY{o}{=} \PY{n}{r}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i} \PY{o}{!}\PY{o}{=} \PY{n}{r}\PY{p}{.}\PY{n}{end}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)}
\PY{k}{if} \PY{p}{(} \PY{n}{m\PYZus{}data}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{<} \PY{n}{min} \PY{p}{)}
\PY{n}{min} \PY{o}{=} \PY{n}{m\PYZus{}data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{;}
\PY{n}{m\PYZus{}min} \PY{o}{=} \PY{n}{min}\PY{p}{;}
\PY{p}{\PYZcb{}}
\PY{k+kt}{void} \PY{n}{join}\PY{p}{(}\PY{k}{const} \PY{n}{itbbReduce}\PY{o}{&} \PY{n}{other}\PY{p}{)} \PY{p}{\PYZob{}}
\PY{k}{if} \PY{p}{(} \PY{n}{other}\PY{p}{.}\PY{n}{m\PYZus{}min} \PY{o}{<} \PY{n}{m\PYZus{}min} \PY{p}{)}
\PY{n}{m\PYZus{}min} \PY{o}{=} \PY{n}{other}\PY{p}{.}\PY{n}{m\PYZus{}min}\PY{p}{;}
\PY{p}{\PYZcb{}}
\PY{p}{\PYZcb{}}\PY{p}{;}
\PY{n}{itbbReduce} \PY{n}{mif}\PY{p}{(}\PY{n}{data}\PY{p}{)}\PY{p}{;}
\PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{parallel\PYZus{}reduce}\PY{p}{(}\PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{blocked\PYZus{}range}\PY{o}{<}\PY{n}{size\PYZus{}t}\PY{o}{>}\PY{p}{(}\PY{l+m+mi}{0}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{CHUNK\PYZus{}SIZE}\PY{p}{)}\PY{p}{,} \PY{n}{mif}\PY{p}{)}\PY{p}{;}
\PY{k+kt}{float} \PY{n}{min} \PY{o}{=} \PY{n}{mif}\PY{p}{.}\PY{n}{m\PYZus{}min}\PY{p}{;}
\end{Verbatim}
\end{block}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{QtConcurrent reduce}
\begin{block}{c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k+kt}{void} \PY{n}{findMinimum}\PY{p}{(}\PY{k}{const} \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{:}\PY{o}{:}\PY{n}{const\PYZus{}iterator} \PY{n}{begin}\PY{p}{,}
\PY{k}{const} \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{:}\PY{o}{:}\PY{n}{const\PYZus{}iterator} \PY{n}{end}\PY{p}{,}
\PY{k+kt}{float} \PY{o}{*}\PY{n}{result}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{n}{result} \PY{o}{=} \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{min\PYZus{}element}\PY{p}{(}\PY{n}{begin}\PY{p}{,} \PY{n}{end}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\PY{k+kt}{float} \PY{n}{QtReduce}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{,}
\PY{k}{const} \PY{k+kt}{int} \PY{n}{numberOfThreads}\PY{p}{,}
\PY{k}{const} \PY{k+kt}{int} \PY{n}{chunkSize}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>} \PY{n}{separate\PYZus{}results}\PY{p}{(}\PY{n}{numberOfThreads}\PY{p}{,} \PY{n}{FLT\PYZus{}MAX}\PY{p}{)}\PY{p}{;}
\PY{n}{QFutureSynchronizer}\PY{o}{<}\PY{k+kt}{void}\PY{o}{>} \PY{n}{synchronizer}\PY{p}{;}
\PY{k}{for}\PY{p}{(}\PY{k+kt}{int} \PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{numberOfThreads}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)}
\PY{n}{synchronizer}\PY{p}{.}\PY{n}{addFuture}\PY{p}{(}\PY{n}{QtConcurrent}\PY{o}{:}\PY{o}{:}\PY{n}{run}\PY{p}{(}\PY{n}{findLocalMinimum}\PY{p}{,}
\PY{n}{data}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{o}{+}\PY{n}{i}\PY{o}{*}\PY{n}{chunkSize}\PY{p}{,}
\PY{n}{data}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{o}{+}\PY{p}{(}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{1}\PY{p}{)}\PY{o}{*}\PY{n}{chunkSize}\PY{p}{,}
\PY{n}{separate\PYZus{}results}\PY{p}{.}\PY{n}{data}\PY{p}{(}\PY{p}{)}\PY{o}{+}\PY{n}{i}\PY{p}{)}\PY{p}{)}\PY{p}{;}
\PY{n}{synchronizer}\PY{p}{.}\PY{n}{waitForFinished}\PY{p}{(}\PY{p}{)}\PY{p}{;}
\PY{k+kt}{float} \PY{n}{min}\PY{p}{(}\PY{n}{FLT\PYZus{}MAX}\PY{p}{)}\PY{p}{;}
\PY{n}{findMinimum}\PY{p}{(}\PY{n}{separate\PYZus{}results}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{separate\PYZus{}results}\PY{p}{.}\PY{n}{end}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{min}\PY{p}{)}\PY{p}{;}
\PY{k}{return} \PY{n}{min}\PY{p}{;}
\PY{p}{\PYZcb{}}
\end{Verbatim}
\end{block}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Reduce execution times}
\begin{center}
\includegraphics[height=5cm]{reduce.png}
\end{center}
\begin{exampleblock}{Note}
\small
No need for more than 4 threads.
\end{exampleblock}
\end{frame}
\subsection{Sort}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Serial sort}
\begin{block}{c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k+kt}{void} \PY{n}{serialSort}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{sort}\PY{p}{(}\PY{n}{data}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{end}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\end{Verbatim}
\end{block}
\begin{exampleblock}{Note: quicksort}
\small
\begin{itemize}
\item Pick a pivot point.
\item Partition: Swap elements compared to pivot point.
\item Recursively calls itself with the 2 new partitions.
\end{itemize}
\end{exampleblock}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{openMP, Intel TBB sort}
\begin{block}{openMP c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{c+cp}{\PYZsh{}}\PY{c+cp}{include <parallel}\PY{c+cp}{/}\PY{c+cp}{algorithm>}
\PY{k+kt}{void} \PY{n}{openMpSort}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{n}{\PYZus{}\PYZus{}gnu\PYZus{}parallel}\PY{o}{:}\PY{o}{:}\PY{n}{sort}\PY{p}{(}\PY{n}{data}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{end}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\end{Verbatim}
\end{block}
\begin{exampleblock}{Note}
\small
Some algorithms are already rewritten to work in parallel with openMP.
\end{exampleblock}
\begin{block}{Intel TBB c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k+kt}{void} \PY{n}{itbbSort}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{parallel\PYZus{}sort}\PY{p}{(}\PY{n}{data}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{end}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\end{Verbatim}
\end{block}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Sort execution times}
\begin{center}
\includegraphics[height=5cm]{sort1.png}
\end{center}
\begin{exampleblock}{Note}
\small
No need for more than 6 threads.
\end{exampleblock}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Custom QtConcurrent sort}
\begin{block}{c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k}{template} \PY{o}{<}\PY{k}{class} \PY{n+nc}{SortType}\PY{o}{>}
\PY{k+kt}{long} \PY{n}{QsPartition}\PY{p}{(}\PY{n}{SortType} \PY{n}{outputArray}\PY{p}{[}\PY{p}{]}\PY{p}{,} \PY{k+kt}{long} \PY{n}{left}\PY{p}{,} \PY{k+kt}{long} \PY{n}{right}\PY{p}{)} \PY{p}{\PYZob{}} \PY{p}{.}\PY{p}{.}\PY{p}{.} \PY{p}{\PYZcb{}}
\PY{k}{template} \PY{o}{<}\PY{k}{class} \PY{n+nc}{SortType}\PY{o}{>}
\PY{k+kt}{void} \PY{n}{QsSequential}\PY{p}{(}\PY{n}{SortType} \PY{n}{array}\PY{p}{[}\PY{p}{]}\PY{p}{,} \PY{k}{const} \PY{k+kt}{long} \PY{n}{left}\PY{p}{,} \PY{k}{const} \PY{k+kt}{long} \PY{n}{right}\PY{p}{)} \PY{p}{\PYZob{}} \PY{p}{.}\PY{p}{.}\PY{p}{.} \PY{p}{\PYZcb{}}
\PY{k}{template} \PY{o}{<}\PY{k}{class} \PY{n+nc}{SortType}\PY{o}{>}
\PY{k+kt}{void} \PY{n}{QuickSortTask} \PY{p}{(}\PY{n}{SortType} \PY{n}{array}\PY{p}{[}\PY{p}{]}\PY{p}{,} \PY{k}{const} \PY{k+kt}{long} \PY{n}{left}\PY{p}{,} \PY{k}{const} \PY{k+kt}{long} \PY{n}{right}\PY{p}{,} \PY{k}{const} \PY{k+kt}{int} \PY{n}{deep}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{k}{if} \PY{p}{(}\PY{n}{left} \PY{o}{<} \PY{n}{right}\PY{p}{)} \PY{p}{\PYZob{}}
\PY{k}{if} \PY{p}{(}\PY{n}{deep}\PY{p}{)} \PY{p}{\PYZob{}}
\PY{k}{const} \PY{k+kt}{long} \PY{n}{part} \PY{o}{=} \PY{n}{QsPartition}\PY{p}{(}\PY{n}{array}\PY{p}{,} \PY{n}{left}\PY{p}{,} \PY{n}{right}\PY{p}{)}\PY{p}{;}
\PY{n}{QtConcurrent}\PY{o}{:}\PY{o}{:}\PY{n}{run}\PY{p}{(}\PY{n}{QuickSortTask}\PY{o}{<}\PY{n}{SortType}\PY{o}{>}\PY{p}{,} \PY{n}{array}\PY{p}{,} \PY{n}{part} \PY{o}{+} \PY{l+m+mi}{1}\PY{p}{,} \PY{n}{right}\PY{p}{,} \PY{n}{deep} \PY{o}{-} \PY{l+m+mi}{1}\PY{p}{)}\PY{p}{;}
\PY{n}{QtConcurrent}\PY{o}{:}\PY{o}{:}\PY{n}{run}\PY{p}{(}\PY{n}{QuickSortTask}\PY{o}{<}\PY{n}{SortType}\PY{o}{>}\PY{p}{,} \PY{n}{array}\PY{p}{,} \PY{n}{left}\PY{p}{,} \PY{n}{part} \PY{o}{-} \PY{l+m+mi}{1}\PY{p}{,} \PY{n}{deep} \PY{o}{-} \PY{l+m+mi}{1}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}} \PY{k}{else} \PY{p}{\PYZob{}}
\PY{k}{const} \PY{k+kt}{long} \PY{n}{part} \PY{o}{=} \PY{n}{QsPartition}\PY{p}{(}\PY{n}{array}\PY{p}{,} \PY{n}{left}\PY{p}{,} \PY{n}{right}\PY{p}{)}\PY{p}{;}
\PY{n}{QsSequential}\PY{p}{(}\PY{n}{array}\PY{p}{,}\PY{n}{part} \PY{o}{+} \PY{l+m+mi}{1}\PY{p}{,}\PY{n}{right}\PY{p}{)}\PY{p}{;}
\PY{n}{QsSequential}\PY{p}{(}\PY{n}{array}\PY{p}{,}\PY{n}{left}\PY{p}{,}\PY{n}{part} \PY{o}{-} \PY{l+m+mi}{1}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\PY{p}{\PYZcb{}}
\PY{p}{\PYZcb{}}
\PY{k+kt}{void} \PY{n}{QtSort}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{n}{QtConcurrent}\PY{o}{:}\PY{o}{:}\PY{n}{run}\PY{p}{(}\PY{n}{QuickSortTask}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{data}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{l+m+mi}{0}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)} \PY{o}{-} \PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{6}\PY{p}{)}\PY{p}{;}
\PY{n}{QThreadPool}\PY{o}{:}\PY{o}{:}\PY{n}{globalInstance}\PY{p}{(}\PY{p}{)}\PY{o}{-}\PY{o}{>}\PY{n}{waitForDone}\PY{p}{(}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\end{Verbatim}
\end{block}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Custom openMP sort}
\begin{block}{c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k+kt}{void} \PY{n}{sample\PYZus{}qsort}\PY{p}{(}\PY{k+kt}{float}\PY{o}{*} \PY{n}{begin}\PY{p}{,} \PY{k+kt}{float}\PY{o}{*} \PY{n}{end}\PY{p}{)} \PY{p}{\PYZob{}} \PY{p}{.}\PY{p}{.}\PY{p}{.} \PY{p}{\PYZcb{}}
\PY{k+kt}{void} \PY{n}{sample\PYZus{}qsort\PYZus{}serial}\PY{p}{(}\PY{k+kt}{float}\PY{o}{*} \PY{n}{begin}\PY{p}{,} \PY{k+kt}{float}\PY{o}{*} \PY{n}{end}\PY{p}{)} \PY{p}{\PYZob{}} \PY{p}{.}\PY{p}{.}\PY{p}{.} \PY{p}{\PYZcb{}}
\PY{k+kt}{void} \PY{n}{sample\PYZus{}qsort\PYZus{}adaptive}\PY{p}{(}\PY{k+kt}{float}\PY{o}{*} \PY{n}{begin}\PY{p}{,} \PY{k+kt}{float}\PY{o}{*} \PY{n}{end}\PY{p}{,} \PY{k}{const} \PY{k+kt}{long} \PY{n}{nthreshold}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{k}{if} \PY{p}{(}\PY{n}{begin} \PY{o}{!}\PY{o}{=} \PY{n}{end}\PY{p}{)} \PY{p}{\PYZob{}}
\PY{c+c1}{// partition ...}
\PY{k}{if} \PY{p}{(}\PY{n}{end} \PY{o}{-} \PY{n}{begin} \PY{o}{+} \PY{l+m+mi}{1} \PY{o}{<}\PY{o}{=} \PY{n}{nthreshold}\PY{p}{)} \PY{p}{\PYZob{}}
\PY{n}{sample\PYZus{}qsort\PYZus{}serial}\PY{p}{(}\PY{n}{begin}\PY{p}{,} \PY{n}{middle}\PY{p}{)}\PY{p}{;}
\PY{n}{sample\PYZus{}qsort\PYZus{}serial}\PY{p}{(}\PY{o}{+}\PY{o}{+}\PY{n}{middle}\PY{p}{,} \PY{o}{+}\PY{o}{+}\PY{n}{end}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}} \PY{k}{else} \PY{p}{\PYZob{}}
\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp task}
\PY{n}{sample\PYZus{}qsort\PYZus{}adaptive}\PY{p}{(}\PY{n}{begin}\PY{p}{,} \PY{n}{middle}\PY{p}{,} \PY{n}{nthreshold}\PY{p}{)}\PY{p}{;}
\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp task}
\PY{n}{sample\PYZus{}qsort\PYZus{}adaptive}\PY{p}{(}\PY{o}{+}\PY{o}{+}\PY{n}{middle}\PY{p}{,} \PY{o}{+}\PY{o}{+}\PY{n}{end}\PY{p}{,} \PY{n}{nthreshold}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\PY{p}{\PYZcb{}}
\PY{p}{\PYZcb{}}
\PY{k+kt}{void} \PY{n}{sample\PYZus{}qsort\PYZus{}adaptive}\PY{p}{(}\PY{k+kt}{float}\PY{o}{*} \PY{n}{begin}\PY{p}{,} \PY{k+kt}{float}\PY{o}{*} \PY{n}{end}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{k+kt}{long} \PY{n}{nthreshold} \PY{o}{=} \PY{n}{ceil}\PY{p}{(}\PY{n}{sqrt}\PY{p}{(}\PY{n}{end} \PY{o}{-} \PY{n}{begin} \PY{o}{+} \PY{l+m+mi}{1}\PY{p}{)}\PY{p}{)} \PY{o}{/} \PY{l+m+mi}{2}\PY{p}{;}
\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp parallel}
\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp single nowait}
\PY{n}{sample\PYZus{}qsort\PYZus{}adaptive}\PY{p}{(}\PY{n}{begin}\PY{p}{,} \PY{n}{end}\PY{p}{,} \PY{n}{nthreshold}\PY{p}{)}\PY{p}{;}
\PY{p}{\PYZcb{}}
\end{Verbatim}
\end{block}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Sort times of custom algorithms}
\begin{center}
\includegraphics[height=5cm]{sort2.png}
\end{center}
\begin{exampleblock}{Note}
\small
Container size is 6M elements -- the resulting speedup is miserable...
\end{exampleblock}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Two quicksort parallelization approaches}
\begin{columns}[t]
\column{1.7in}
\begin{block}{Threshold}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k+kt}{void} \PY{n}{qsort}\PY{p}{(}\PY{k+kt}{float}\PY{o}{*} \PY{n}{begin}\PY{p}{,}
\PY{k+kt}{float}\PY{o}{*} \PY{n}{end}\PY{p}{,}
\PY{k}{const} \PY{k+kt}{long} \PY{n}{nthreshold}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{k}{if} \PY{p}{(}\PY{n}{begin} \PY{o}{!}\PY{o}{=} \PY{n}{end}\PY{p}{)} \PY{p}{\PYZob{}}
\PY{c+c1}{// partition ...}
\PY{k}{if} \PY{p}{(}\PY{n}{end}\PY{o}{-}\PY{n}{begin}\PY{o}{+}\PY{l+m+mi}{1} \PY{o}{<}\PY{o}{=} \PY{n}{nthreshold}\PY{p}{)} \PY{p}{\PYZob{}}
\PY{c+c1}{// serial sort ...}
\PY{p}{\PYZcb{}} \PY{k}{else} \PY{p}{\PYZob{}}
\PY{c+c1}{// parallel sort ...}
\PY{p}{\PYZcb{}}
\PY{p}{\PYZcb{}}
\PY{p}{\PYZcb{}}
\PY{k+kt}{long} \PY{n}{deep} \PY{o}{=}
\PY{n}{ceil}\PY{p}{(}\PY{n}{sqrt}\PY{p}{(}\PY{n}{end} \PY{o}{-} \PY{n}{begin} \PY{o}{+} \PY{l+m+mi}{1}\PY{p}{)}\PY{p}{)} \PY{o}{/} \PY{l+m+mi}{2}\PY{p}{;}
\end{Verbatim}
\end{block}
\column{1.5in}
\begin{block}{Depth}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k+kt}{void} \PY{n}{qsort}\PY{p}{(}\PY{k+kt}{float}\PY{o}{*} \PY{n}{begin}\PY{p}{,}
\PY{k+kt}{float}\PY{o}{*} \PY{n}{end}\PY{p}{,}
\PY{k}{const} \PY{k+kt}{int} \PY{n}{deep}\PY{p}{)}
\PY{p}{\PYZob{}}
\PY{k}{if} \PY{p}{(}\PY{n}{begin} \PY{o}{!}\PY{o}{=} \PY{n}{end}\PY{p}{)} \PY{p}{\PYZob{}}
\PY{c+c1}{// partition ...}
\PY{k}{if} \PY{p}{(}\PY{n}{deep}\PY{p}{)} \PY{p}{\PYZob{}}
\PY{c+c1}{// serial sort ...}
\PY{p}{\PYZcb{}} \PY{k}{else} \PY{p}{\PYZob{}}
\PY{c+c1}{// parallel sort with deep-1}
\PY{p}{\PYZcb{}}
\PY{p}{\PYZcb{}}
\PY{p}{\PYZcb{}}
\PY{k+kt}{long} \PY{n}{deep} \PY{o}{=} \PY{l+m+mi}{15}\PY{p}{;}
\end{Verbatim}
\end{block}
\end{columns}
\begin{exampleblock}{Note}
\small
Depth seems simpler yet faster.
\end{exampleblock}
\end{frame}
\section{Final thoughts}
%----------- slide --------------------------------------------------%
\subsection{Grainsize}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Chunk size}
\begin{center}
\includegraphics[height=5cm]{chunksize.png}
\end{center}
\begin{exampleblock}{Note}
\small
\begin{itemize}
\item Unit is loop iterations per chunk. The default value is 1.
\item Too small chunks can introduce more overhead than useful work.
\end{itemize}
\end{exampleblock}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Grain size}
\begin{center}
\includegraphics[height=5cm]{grainsize.png}
\end{center}
\begin{exampleblock}{Note}
\small
\begin{itemize}
\item Unit is CPU cycles.
\item Should be at least approximately 100,000.
\end{itemize}
\end{exampleblock}
\end{frame}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{Task stealing - Intel TBB}
\begin{block}{Task stealing}
\begin{itemize}
\item Each thread has a queue of tasks.
\item If a thread has no more tasks then it ``steals'' from another.
\item Think about tasks, not about threads when programming.
\end{itemize}
\end{block}
\begin{exampleblock}{Threadpool}
A thread pool with a shared concurrent queue of tasks is a common practice in networking servers.
\end{exampleblock}
\begin{exampleblock}{Work stealing}
Another implementation is Cilk\cite{cilk} - where each processor has a stack of frames.
\end{exampleblock}
\end{frame}
\subsection{Convolution}
%----------- slide --------------------------------------------------%
\begin{frame}[fragile]{1D gaussian filter}
\begin{columns}[t]
\column{2.5in}
\begin{block}{c++ code}
\tiny
\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1]
\PY{k+kt}{void} \PY{n}{serialConvolution}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{output}\PY{p}{,}
\PY{k}{const} \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}