diff --git a/etc/bins/1d_gauss.png b/etc/bins/1d_gauss.png new file mode 100644 index 0000000..0ffcefa Binary files /dev/null and b/etc/bins/1d_gauss.png differ diff --git a/etc/bins/chunksize.png b/etc/bins/chunksize.png new file mode 100755 index 0000000..f69f0f5 Binary files /dev/null and b/etc/bins/chunksize.png differ diff --git a/etc/bins/convolution.png b/etc/bins/convolution.png new file mode 100644 index 0000000..dc73d86 Binary files /dev/null and b/etc/bins/convolution.png differ diff --git a/etc/bins/grainsize.png b/etc/bins/grainsize.png new file mode 100755 index 0000000..7c80561 Binary files /dev/null and b/etc/bins/grainsize.png differ diff --git a/etc/bins/map.png b/etc/bins/map.png new file mode 100644 index 0000000..aa6cc9d Binary files /dev/null and b/etc/bins/map.png differ diff --git a/etc/bins/parallel_programming_in_cpp.pdf b/etc/bins/parallel_programming_in_cpp.pdf new file mode 100644 index 0000000..1109740 Binary files /dev/null and b/etc/bins/parallel_programming_in_cpp.pdf differ diff --git a/etc/bins/reduce.png b/etc/bins/reduce.png new file mode 100644 index 0000000..58e9c95 Binary files /dev/null and b/etc/bins/reduce.png differ diff --git a/etc/bins/sort1.png b/etc/bins/sort1.png new file mode 100644 index 0000000..25cc4da Binary files /dev/null and b/etc/bins/sort1.png differ diff --git a/etc/bins/sort2.png b/etc/bins/sort2.png new file mode 100644 index 0000000..8d6ec71 Binary files /dev/null and b/etc/bins/sort2.png differ diff --git a/presentation/gnuplot/1d_gauss.gp b/presentation/gnuplot/1d_gauss.gp new file mode 100644 index 0000000..1e390cc --- /dev/null +++ b/presentation/gnuplot/1d_gauss.gp @@ -0,0 +1,31 @@ +#!/usr/bin/gnuplot + +#input +set datafile separator ";" + +#output +set key top left +# set style data lines +set grid + +set xlabel 'X' +set yrange [0:0.383] + +set ylabel 'Y' +set xrange [-4:4] + +set terminal png transparent nocrop medium size 200,200 +set output '1d_gauss.png' + +plot \ +'-' using 1:($2) title '1D 
discrete gaussian filter' axes x1y1 lt rgb 'blue' lw 2 +-4;0 +-3;0.006 +-2;0.061 +-1;0.242 +0;0.383 +1;0.242 +2;0.061 +3;0.006 +4;0 +e \ No newline at end of file diff --git a/presentation/gnuplot/convolution.gp b/presentation/gnuplot/convolution.gp new file mode 100644 index 0000000..9012bfb --- /dev/null +++ b/presentation/gnuplot/convolution.gp @@ -0,0 +1,66 @@ +#!/usr/bin/gnuplot + +#input +set datafile separator ";" + +#output +set key top left +set style data lines +set grid + +set xlabel 'Number of threads' +set yrange [0:0.53] +set ylabel 'Executions time [s]' + +set terminal png transparent nocrop medium size 800,400 +set output 'convolution.png' + +plot \ +'-' using 1:($2) title 'Serial' axes x1y1 lt rgb 'black' lw 2, \ +'-' using 1:($2) title 'Serial - optimized' axes x1y1 lt rgb 'red' lw 2, \ +'-' using 1:($2) title 'intel TBB' axes x1y1 lt rgb 'dark-blue' lw 2, \ +'-' using 1:($2) title 'intel TBB - optimized' axes x1y1 lt rgb 'dark-green' lw 2 +1;0.33 +2;0.33 +3;0.33 +4;0.33 +5;0.33 +6;0.33 +7;0.33 +8;0.33 +9;0.33 +10;0.33 +e +1;0.25 +2;0.25 +3;0.25 +4;0.25 +5;0.25 +6;0.25 +7;0.25 +8;0.25 +9;0.25 +10;0.25 +e +1;0.52 +2;0.26 +3;0.27 +4;0.14 +5;0.14 +6;0.13 +7;0.13 +8;0.12 +9;0.13 +10;0.12 +e +1;0.25 +2;0.125 +3;0.13 +4;0.067 +5;0.067 +6;0.063 +7;0.063 +8;0.0577 +9;0.063 +10;0.0577 +e diff --git a/presentation/gnuplot/map.gp b/presentation/gnuplot/map.gp new file mode 100644 index 0000000..98c9cfd --- /dev/null +++ b/presentation/gnuplot/map.gp @@ -0,0 +1,67 @@ +#!/usr/bin/gnuplot + +#input +set datafile separator ";" + +#output +set key top left +set style data lines +set grid + +set xlabel 'Number of threads' +set yrange [0:6.5] +set ylabel 'Executions time [s]' + +set terminal png transparent nocrop medium size 800,400 +set output 'map.png' + +plot \ +'-' using 1:($2) title 'Serial' axes x1y1 lt rgb 'black' lw 2, \ +'-' using 1:($2) title 'openMP' axes x1y1 lt rgb 'red' lw 2, \ +'-' using 1:($2) title 'intel TBB' axes x1y1 lt rgb 'dark-blue' lw 2, \ 
+'-' using 1:($2) title 'QtConcurrent' axes x1y1 lt rgb 'dark-green' lw 2 +1;0.5 +2;0.67 +3;0.68 +4;0.69 +5;0.68 +6;0.7 +7;0.65 +8;0.72 +9;0.68 +10;0.69 +e +1;6.12 +2;3.41 +3;2.12 +4;1.61 +5;1.27 +6;1.59 +7;1.17 +8;1.38 +9;0.97 +10;0.87 +e +1;6.25 +2;3.45 +3;3.17 +4;1.62 +5;1.6 +6;1.6 +7;1.6 +8;1.51 +9;1.66 +10;1.32 +e +1;3.22 +2;2.2 +3;1.65 +4;1.32 +5;1.43 +6;1.44 +7;0.95 +8;0.9 +9;0.85 +10;0.8 +e + diff --git a/presentation/gnuplot/reduce.gp b/presentation/gnuplot/reduce.gp new file mode 100644 index 0000000..d9ef948 --- /dev/null +++ b/presentation/gnuplot/reduce.gp @@ -0,0 +1,67 @@ +#!/usr/bin/gnuplot + +#input +set datafile separator ";" + +#output +set key top left +set style data lines +set grid + +set xlabel 'Number of threads' +set yrange [0:0.8] +set ylabel 'Executions time [s]' + +set terminal png transparent nocrop medium size 800,400 +set output 'reduce.png' + +plot \ +'-' using 1:($2) title 'Serial' axes x1y1 lt rgb 'black' lw 2, \ +'-' using 1:($2) title 'openMP' axes x1y1 lt rgb 'red' lw 2, \ +'-' using 1:($2) title 'intel TBB' axes x1y1 lt rgb 'dark-blue' lw 2, \ +'-' using 1:($2) title 'QtConcurrent' axes x1y1 lt rgb 'dark-green' lw 2 +1;0.67 +2;0.67 +3;0.61 +4;0.68 +5;0.59 +6;0.7 +7;0.67 +8;0.7 +9;0.65 +10;0.68 +e +1;0.52 +2;0.27 +3;0.19 +4;0.15 +5;0.15 +6;0.15 +7;0.22 +8;0.2 +9;0.17 +10;0.16 +e +1;0.68 +2;0.35 +3;0.35 +4;0.18 +5;0.18 +6;0.18 +7;0.19 +8;0.18 +9;0.18 +10;0.15 +e +1;0.25 +2;0.18 +3;0.15 +4;0.15 +5;0.15 +6;0.17 +7;0.17 +8;0.16 +9;0.17 +10;0.17 +e + diff --git a/presentation/gnuplot/sort1.gp b/presentation/gnuplot/sort1.gp new file mode 100644 index 0000000..e4fe144 --- /dev/null +++ b/presentation/gnuplot/sort1.gp @@ -0,0 +1,54 @@ +#!/usr/bin/gnuplot + +#input +set datafile separator ";" + +#output +set key top left +set style data lines +set grid + +set xlabel 'Number of threads' +set yrange [0:3.5] +set ylabel 'Executions time [s]' + +set terminal png transparent nocrop medium size 800,400 +set output 'sort1.png' + +plot \ +'-' 
using 1:($2) title 'Serial' axes x1y1 lt rgb 'black' lw 2, \ +'-' using 1:($2) title 'openMP' axes x1y1 lt rgb 'red' lw 2, \ +'-' using 1:($2) title 'intel TBB' axes x1y1 lt rgb 'dark-blue' lw 2 +1;2.33 +2;3.39 +3;3.05 +4;3.19 +5;2.98 +6;2.99 +7;3.15 +8;3.24 +9;3.09 +10;2.99 +e +1;3.04 +2;1.6 +3;1.66 +4;0.84 +5;0.71 +6;0.65 +7;0.7 +8;0.61 +9;0.56 +10;0.51 +e +1;3.33 +2;1.81 +3;1.32 +4;1.7 +5;1.26 +6;0.89 +7;0.87 +8;0.83 +9;0.81 +10;0.82 +e \ No newline at end of file diff --git a/presentation/gnuplot/sort2.gp b/presentation/gnuplot/sort2.gp new file mode 100644 index 0000000..97b1b29 --- /dev/null +++ b/presentation/gnuplot/sort2.gp @@ -0,0 +1,90 @@ +#!/usr/bin/gnuplot + +#input +set datafile separator ";" + +#output +set key top left +set style data lines +set grid + +set xlabel 'Number of threads' +set yrange [0:18] +set ylabel 'Executions time [s]' + +set terminal png transparent nocrop medium size 800,400 +set output 'sort2.png' + +plot \ +'-' using 1:($2) title 'Serial' axes x1y1 lt rgb 'black' lw 2, \ +'-' using 1:($2) title 'openMP' axes x1y1 lt rgb 'red' lw 2, \ +'-' using 1:($2) title 'intel TBB' axes x1y1 lt rgb 'dark-blue' lw 2, \ +'-' using 1:($2) title 'cusom openMP threashold' axes x1y1 lt rgb 'steelblue' lw 2, \ +'-' using 1:($2) title 'cusom openMP deep' axes x1y1 lt rgb 'olivedrab' lw 2, \ +'-' using 1:($2) title 'custom QtConcurrent' axes x1y1 lt rgb 'orangered' lw 2 +1;0.29 +2;0.29 +3;0.29 +4;0.29 +5;0.29 +6;0.29 +7;0.29 +8;0.29 +9;0.29 +10;0.29 +e +1;0.29 +2;0.16 +3;0.11 +4;0.82 +5;0.7 +6;0.59 +7;0.66 +8;0.6 +9;0.56 +10;0.51 +e +1;0.29 +2;0.18 +3;0.13 +4;0.69 +5;0.94 +6;0.87 +7;0.83 +8;0.8 +9;0.84 +10;0.82 +e +1;14.32 +2;11.67 +3;6.61 +4;5.69 +5;5.64 +6;4.24 +7;3.71 +8;3.56 +9;3.76 +10;3.18 +e +1;17.62 +2;8.64 +3;5.92 +4;4.45 +5;4.07 +6;3.18 +7;3.12 +8;3.11 +9;3.05 +10;3.1 +e +1;17.39 +2;8.91 +3;7.93 +4;5.74 +5;4.64 +6;4.19 +7;3.97 +8;4.68 +9;4.85 +10;4 +e diff --git a/presentation/latex/colordefs.tex b/presentation/latex/colordefs.tex new file 
mode 100644 index 0000000..1316dc2 --- /dev/null +++ b/presentation/latex/colordefs.tex @@ -0,0 +1,84 @@ +\def\PY@reset{\let\PY@it=\relax \let\PY@bf=\relax% + \let\PY@ul=\relax \let\PY@tc=\relax% + \let\PY@bc=\relax \let\PY@ff=\relax} +\def\PY@tok#1{\csname PY@tok@#1\endcsname} +\def\PY@toks#1+{\ifx\relax#1\empty\else% + \PY@tok{#1}\expandafter\PY@toks\fi} +\def\PY@do#1{\PY@bc{\PY@tc{\PY@ul{% + \PY@it{\PY@bf{\PY@ff{#1}}}}}}} +\def\PY#1#2{\PY@reset\PY@toks#1+\relax+\PY@do{#2}} + +\def\PY@tok@gd{\def\PY@tc##1{\textcolor[rgb]{0.63,0.00,0.00}{##1}}} +\def\PY@tok@gu{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.50,0.00,0.50}{##1}}} +\def\PY@tok@gt{\def\PY@tc##1{\textcolor[rgb]{0.00,0.25,0.82}{##1}}} +\def\PY@tok@gs{\let\PY@bf=\textbf} +\def\PY@tok@gr{\def\PY@tc##1{\textcolor[rgb]{1.00,0.00,0.00}{##1}}} +\def\PY@tok@cm{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}} +\def\PY@tok@vg{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\def\PY@tok@m{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\def\PY@tok@mh{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\def\PY@tok@go{\def\PY@tc##1{\textcolor[rgb]{0.50,0.50,0.50}{##1}}} +\def\PY@tok@ge{\let\PY@it=\textit} +\def\PY@tok@vc{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\def\PY@tok@il{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\def\PY@tok@cs{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}} +\def\PY@tok@cp{\def\PY@tc##1{\textcolor[rgb]{0.74,0.48,0.00}{##1}}} +\def\PY@tok@gi{\def\PY@tc##1{\textcolor[rgb]{0.00,0.63,0.00}{##1}}} +\def\PY@tok@gh{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}} +\def\PY@tok@ni{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.60,0.60,0.60}{##1}}} +\def\PY@tok@nl{\def\PY@tc##1{\textcolor[rgb]{0.63,0.63,0.00}{##1}}} +\def\PY@tok@nn{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}} +\def\PY@tok@no{\def\PY@tc##1{\textcolor[rgb]{0.53,0.00,0.00}{##1}}} 
+\def\PY@tok@na{\def\PY@tc##1{\textcolor[rgb]{0.49,0.56,0.16}{##1}}} +\def\PY@tok@nb{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\def\PY@tok@nc{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}} +\def\PY@tok@nd{\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}} +\def\PY@tok@ne{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.82,0.25,0.23}{##1}}} +\def\PY@tok@nf{\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}} +\def\PY@tok@si{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.53}{##1}}} +\def\PY@tok@s2{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\def\PY@tok@vi{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\def\PY@tok@nt{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\def\PY@tok@nv{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\def\PY@tok@s1{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\def\PY@tok@sh{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\def\PY@tok@sc{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\def\PY@tok@sx{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\def\PY@tok@bp{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\def\PY@tok@c1{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}} +\def\PY@tok@kc{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\def\PY@tok@c{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}} +\def\PY@tok@mf{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\def\PY@tok@err{\def\PY@bc##1{\fcolorbox[rgb]{1.00,0.00,0.00}{1,1,1}{##1}}} +\def\PY@tok@kd{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\def\PY@tok@ss{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\def\PY@tok@sr{\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.53}{##1}}} +\def\PY@tok@mo{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\def\PY@tok@kn{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} 
+\def\PY@tok@mi{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\def\PY@tok@gp{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}} +\def\PY@tok@o{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\def\PY@tok@kr{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\def\PY@tok@s{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\def\PY@tok@kp{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\def\PY@tok@w{\def\PY@tc##1{\textcolor[rgb]{0.73,0.73,0.73}{##1}}} +\def\PY@tok@kt{\def\PY@tc##1{\textcolor[rgb]{0.69,0.00,0.25}{##1}}} +\def\PY@tok@ow{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}} +\def\PY@tok@sb{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\def\PY@tok@k{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\def\PY@tok@se{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.13}{##1}}} +\def\PY@tok@sd{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} + +\def\PYZbs{\char`\\} +\def\PYZus{\char`\_} +\def\PYZob{\char`\{} +\def\PYZcb{\char`\}} +\def\PYZca{\char`\^} +\def\PYZsh{\char`\#} +\def\PYZpc{\char`\%} +\def\PYZdl{\char`\$} +\def\PYZti{\char`\~} +% for compatibility with earlier versions +\def\PYZat{@} +\def\PYZlb{[} +\def\PYZrb{]} \ No newline at end of file diff --git a/presentation/latex/parallel_programming_in_cpp.tex b/presentation/latex/parallel_programming_in_cpp.tex new file mode 100644 index 0000000..ef85fd8 --- /dev/null +++ b/presentation/latex/parallel_programming_in_cpp.tex @@ -0,0 +1,988 @@ +\documentclass{beamer} + +\usepackage[utf8x]{inputenc} +\usepackage{fancyvrb} +\usepackage{color} +\usepackage{graphicx} + +\usetheme{Darmstadt} + +\title {High-level parallel programming in C++} +\author{Dénes Mátételki} +\institute{www.emerson.com} +\date{March 18, 2012} + +\makeatletter +\include{colordefs} +\makeatother + +\begin{document} + +%----------- slide --------------------------------------------------% + 
+\begin{frame} +\titlepage +\end{frame} + +%----------- slide --------------------------------------------------% + +\begin{frame} +\frametitle{Table of contents} +\tableofcontents +\end{frame} + +\section{Theory} + +\subsection{High level vs. low level} + +%----------- slide --------------------------------------------------% + +\begin{frame}{Comparison} +\begin{columns}[t] +\column{1.5in} + +\begin{block}{High level} +\small +\begin{itemize} + \item Auto scaling-up + \item Threadpool handling, load balancing. + \item Synchronization and mutexes are handled. +\end{itemize} +\end{block} + +\column{1.5in} + +\begin{block}{Low level} +\small +\begin{itemize} + \item Manual thread creation. + \item Manual joins and mutex handling. + \item Better for event and I/O based threading. + \item Compiler and external library independent. +\end{itemize} +\end{block} + +\end{columns} +\end{frame} + +%----------- slide --------------------------------------------------% + +\begin{frame}{Compared software (performance, code complexity)} +\begin{columns}[t] +\column{1.5in} + +\begin{block}{Used} +\small +\begin{itemize} + \item Standard c++ (serial examples) + \item openMP\cite{openmp} + \item Intel Threading Building Blocks (TBB)\cite{itbb} + \item QtConcurrent\cite{qtconcurrent} +\end{itemize} +\end{block} + +\column{1.5in} + +\begin{block}{Skipped} +\small +\begin{itemize} + \item std::thread, std::mutex (c++0x)\cite{cpp_thread} + \item POSIX threads\cite{posix_threads} + \item QThread\cite{qt_thread} +\end{itemize} +\end{block} + +\end{columns} + + +\begin{exampleblock}{Co-existence\cite{itbb_openmp_nativethreads}} +\small +Possible, but the separate threadpools can lead to oversubscription. +\end{exampleblock} + +\end{frame} + +%----------- slide --------------------------------------------------% + +\begin{frame}{Comparison} + +\begin{columns}[t] +\column{1.5in} + +\begin{block}{openMP} +\small +\begin{itemize} + \item Compiler support needed. + \item C, C++, Fortran. 
+ \item Best for bounded loops. + \item No need for big code re-write. + \item Hard to debug. + \item Managed by a non-profit organization. +\end{itemize} +\end{block} +\column{1.5in} + + +\begin{block}{Intel TBB} +\small +\begin{itemize} + \item Object oriented. + \item Concurrent data types. + \item Parallel algorithms. + \item Work stealing: dynamic load sharing. + \item Relies heavily on templates. + \item Heavy code rewrite is needed. +\end{itemize} +\end{block} + +\column{1.5in} + +\begin{block}{QtConcurrent} +\small +\begin{itemize} + \item Object oriented + \item Limited number of algorithms. + \item ... +\end{itemize} +\end{block} + +\end{columns} + +\end{frame} + + +\subsection{Algorithms} + +%----------- slide --------------------------------------------------% + +\begin{frame}[fragile]{Used algorithms for testing} + +\begin{block}{List} +\begin{itemize} + \item Map - Applies a given function to each element of a container. + \item Reduction - Combines the results of sub-parts. + \item Sort - Puts elements of a list in a certain order. +\end{itemize} +\end{block} + +\begin{exampleblock}{Note} +\small +\begin{itemize} + \item The used container is an \verb|std::vector| + \item Container size was 60 million with random floats [1, 1000] + \item Execution times are the averages of 3 executions. + \item Used hardware was an Intel Xeon 64-bit machine with 6 cores (12 threads), 3.4 GHz. 
+ \item Compiled with gcc-4.4 and use flags: \verb|-O3| \verb|-ffast-math| \verb|-fwhole-program| +\verb|-fomit-frame-pointer| \verb|-march=native| \verb|-m64| +\end{itemize} +\end{exampleblock} + + +\end{frame} + + +\section{Code samples} + +\subsection{Map} + +%----------- slide --------------------------------------------------% + +\begin{frame}[fragile]{Serial map} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{float} \PY{n}{modify}\PY{p}{(}\PY{k+kt}{float} \PY{n}{value}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{k}{return} \PY{l+m+mf}{13.37} \PY{o}{*} \PY{n}{pow}\PY{p}{(}\PY{n}{sqrt}\PY{p}{(}\PY{n}{value}\PY{p}{)}\PY{p}{,} \PY{n}{log}\PY{p}{(}\PY{n}{value}\PY{p}{)}\PY{p}{)}\PY{p}{;} +\PY{p}{\PYZcb{}} + + +\PY{k+kt}{void} \PY{n}{serialMap}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{k}{for} \PY{p}{(}\PY{n}{size\PYZus{}t} \PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)} + \PY{n}{modify}\PY{p}{(}\PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{)}\PY{p}{;} +\PY{p}{\PYZcb{}} +\end{Verbatim} +\end{block} + +\begin{exampleblock}{Note} + \small + \begin{itemize} + \item ``chunksize'' equals the size of the data. + \item This modify function will be used by the parallel examples too. 
+ \end{itemize} +\end{exampleblock} + +\end{frame} + +%----------- slide --------------------------------------------------% + +\begin{frame}[fragile]{openMP parallel map} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{void} \PY{n}{openMpMap}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{,} + \PY{k}{const} \PY{k+kt}{int} \PY{n}{numberOfThreads}\PY{p}{,} + \PY{k}{const} \PY{k+kt}{int} \PY{n}{chunkSize}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{n}{size\PYZus{}t} \PY{n}{i}\PY{p}{;} + +\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp parallel for \PYZbs{}} +\PY{c+cp}{ default(shared) private(i) \PYZbs{}} +\PY{c+cp}{ schedule(dynamic, chunkSize) \PYZbs{}} +\PY{c+cp}{ num\PYZus{}threads(numberOfThreads)} + + \PY{k}{for} \PY{p}{(}\PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)} + \PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{=} \PY{n}{modify}\PY{p}{(}\PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{)}\PY{p}{;} +\PY{p}{\PYZcb{}} +\end{Verbatim} + +\end{block} + +\begin{exampleblock}{Note} +\small +Making it run in parallel is just a single pragma line. 
+\end{exampleblock} + +\end{frame} + +%----------- slide --------------------------------------------------% + + +\begin{frame}[fragile]{Intel TBB map} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k}{class} \PY{n+nc}{itbbMap} \PY{p}{\PYZob{}} +\PY{k}{public}\PY{o}{:} + + \PY{n}{itbbMap}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)} + \PY{o}{:} \PY{n}{m\PYZus{}data}\PY{p}{(}\PY{n}{data}\PY{p}{)} \PY{p}{\PYZob{}}\PY{p}{\PYZcb{}} + + \PY{k+kt}{void} \PY{k}{operator}\PY{p}{(}\PY{p}{)}\PY{p}{(}\PY{k}{const} \PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{blocked\PYZus{}range}\PY{o}{<}\PY{n}{size\PYZus{}t}\PY{o}{>}\PY{o}{&} \PY{n}{r}\PY{p}{)} \PY{k}{const} \PY{p}{\PYZob{}} + \PY{k}{for}\PY{p}{(} \PY{n}{size\PYZus{}t} \PY{n}{i} \PY{o}{=} \PY{n}{r}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i} \PY{o}{!}\PY{o}{=} \PY{n}{r}\PY{p}{.}\PY{n}{end}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+} \PY{p}{)} + \PY{n}{m\PYZus{}data}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{=} \PY{n}{modify}\PY{p}{(}\PY{n}{m\PYZus{}data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{)}\PY{p}{;} + \PY{p}{\PYZcb{}} + +\PY{k}{private}\PY{o}{:} + \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{m\PYZus{}data}\PY{p}{;} +\PY{p}{\PYZcb{}}\PY{p}{;} + + +\PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{task\PYZus{}scheduler\PYZus{}init} \PY{n}{init}\PY{p}{(}\PY{n}{NUMBER\PYZus{}OF\PYZus{}THREADS}\PY{p}{)}\PY{p}{;} +\PY{n}{itbbMap} \PY{n}{im}\PY{p}{(}\PY{n}{data}\PY{p}{)}\PY{p}{;} +\PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{parallel\PYZus{}for}\PY{p}{(}\PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{blocked\PYZus{}range}\PY{o}{<}\PY{n}{size\PYZus{}t}\PY{o}{>}\PY{p}{(}\PY{l+m+mi}{0}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{CHUNK\PYZus{}SIZE}\PY{p}{)}\PY{p}{,} \PY{n}{im}\PY{p}{)}\PY{p}{;} +\end{Verbatim} + +\end{block} + 
+\begin{exampleblock}{Note} +\small +Running a functor on chunks in parallel. +\end{exampleblock} + +\end{frame} + +%----------- slide --------------------------------------------------% + + +\begin{frame}[fragile]{QtConcurrent map} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{void} \PY{n}{QtMap}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{n}{QtConcurrent}\PY{o}{:}\PY{o}{:}\PY{n}{blockingMap}\PY{p}{(}\PY{n}{data}\PY{p}{,} \PY{n}{modify}\PY{p}{)}\PY{p}{;} +\PY{p}{\PYZcb{}} + +\PY{n}{QThreadPool}\PY{o}{:}\PY{o}{:}\PY{n}{globalInstance}\PY{p}{(}\PY{p}{)}\PY{o}{-}\PY{o}{>}\PY{n}{setMaxThreadCount}\PY{p}{(}\PY{n}{NUMBER\PYZus{}OF\PYZus{}THREADS}\PY{p}{)}\PY{p}{;} +\end{Verbatim} + + +\end{block} + +\begin{exampleblock}{Note} + \small + \begin{itemize} + \item Chunksize is 1. + \item Blocks till the iterator reaches the end. + \end{itemize} +\end{exampleblock} + +\end{frame} + +%----------- slide --------------------------------------------------% + +\begin{frame}[fragile]{Map execution times} + +\begin{center} +\includegraphics[height=5cm]{map.png} +\end{center} + +\begin{exampleblock}{Note} +\small +Serial remained the fastest (memory bound?) - No need to parallelize. 
+\end{exampleblock} + +\end{frame} + +\subsection{Reduce} + +%----------- slide --------------------------------------------------% + + +\begin{frame}[fragile]{Serial reduce} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{float} \PY{n}{serialReduce}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{k+kt}{float} \PY{n}{min}\PY{p}{(}\PY{n}{FLT\PYZus{}MAX}\PY{p}{)}\PY{p}{;} + \PY{k}{for} \PY{p}{(}\PY{n}{size\PYZus{}t} \PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)} + \PY{k}{if} \PY{p}{(}\PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{<} \PY{n}{min}\PY{p}{)} + \PY{n}{min} \PY{o}{=} \PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{;} + + \PY{k}{return} \PY{n}{min}\PY{p}{;} +\PY{p}{\PYZcb{}} +\end{Verbatim} + +\end{block} + +\begin{exampleblock}{Note} + \small + \begin{itemize} + \item Minimum value search. + \item Not actually a reduce. + \item Following examples will try to achieve this too. 
+ \end{itemize} +\end{exampleblock} + +\end{frame} + +%----------- slide --------------------------------------------------% + + +\begin{frame}[fragile]{openMP reduce} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{int} \PY{n}{openMpReduce}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{,} + \PY{k}{const} \PY{k+kt}{int} \PY{n}{numberOfThreads}\PY{p}{,} + \PY{k}{const} \PY{k+kt}{int} \PY{n}{chunkSize}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{n}{size\PYZus{}t} \PY{n}{i}\PY{p}{;} + \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>} \PY{n}{separate\PYZus{}results}\PY{p}{(}\PY{n}{numberOfThreads}\PY{p}{,} \PY{n}{FLT\PYZus{}MAX}\PY{p}{)}\PY{p}{;} + +\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp parallel \PYZbs{}} +\PY{c+cp}{ default(shared) private(i) \PYZbs{}} +\PY{c+cp}{ num\PYZus{}threads(numberOfThreads)} + \PY{p}{\PYZob{}} + \PY{k+kt}{int} \PY{n}{threadId} \PY{o}{=} \PY{n}{omp\PYZus{}get\PYZus{}thread\PYZus{}num}\PY{p}{(}\PY{p}{)}\PY{p}{;} + +\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp for schedule(dynamic, chunkSize)} + + \PY{k}{for} \PY{p}{(}\PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)} + \PY{k}{if} \PY{p}{(}\PY{n}{separate\PYZus{}results}\PY{p}{[}\PY{n}{threadId}\PY{p}{]} \PY{o}{<} \PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{)} + \PY{n}{separate\PYZus{}results}\PY{p}{[}\PY{n}{threadId}\PY{p}{]} \PY{o}{=} \PY{n}{data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{;} + \PY{p}{\PYZcb{}} + + \PY{k+kt}{float} \PY{n}{min}\PY{p}{(}\PY{n}{FLT\PYZus{}MAX}\PY{p}{)}\PY{p}{;} + \PY{k}{for} \PY{p}{(}\PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{numberOfThreads}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)} + \PY{k}{if} \PY{p}{(}\PY{n}{separate\PYZus{}results}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{<} 
\PY{n}{min}\PY{p}{)} + \PY{n}{min} \PY{o}{=} \PY{n}{separate\PYZus{}results}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{;} + + \PY{k}{return} \PY{n}{min}\PY{p}{;} +\PY{p}{\PYZcb{}} +\end{Verbatim} + + +\end{block} + +\end{frame} + + +%----------- slide --------------------------------------------------% + + +\begin{frame}[fragile]{Intel TBB reduce} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k}{class} \PY{n+nc}{itbbReduce} \PY{p}{\PYZob{}} + \PY{k}{const} \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{m\PYZus{}data}\PY{p}{;} +\PY{k}{public}\PY{o}{:} + \PY{k+kt}{float} \PY{n}{m\PYZus{}min}\PY{p}{;} + + \PY{n}{itbbReduce}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)} \PY{o}{:} \PY{n}{m\PYZus{}data}\PY{p}{(}\PY{n}{data}\PY{p}{)} \PY{p}{,} \PY{n}{m\PYZus{}min}\PY{p}{(}\PY{n}{FLT\PYZus{}MAX}\PY{p}{)} \PY{p}{\PYZob{}}\PY{p}{\PYZcb{}} + \PY{n}{itbbReduce}\PY{p}{(}\PY{n}{itbbReduce}\PY{o}{&} \PY{n}{other}\PY{p}{,} \PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{split}\PY{p}{)} \PY{o}{:} \PY{n}{m\PYZus{}data}\PY{p}{(}\PY{n}{other}\PY{p}{.}\PY{n}{m\PYZus{}data}\PY{p}{)}\PY{p}{,} \PY{n}{m\PYZus{}min}\PY{p}{(}\PY{n}{FLT\PYZus{}MAX}\PY{p}{)} \PY{p}{\PYZob{}}\PY{p}{\PYZcb{}} + + \PY{k+kt}{void} \PY{k}{operator}\PY{p}{(}\PY{p}{)}\PY{p}{(}\PY{k}{const} \PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{blocked\PYZus{}range}\PY{o}{<}\PY{n}{size\PYZus{}t}\PY{o}{>}\PY{o}{&} \PY{n}{r}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{k+kt}{float} \PY{n}{min} \PY{o}{=} \PY{n}{m\PYZus{}min}\PY{p}{;} + \PY{k}{for}\PY{p}{(}\PY{n}{size\PYZus{}t} \PY{n}{i} \PY{o}{=} \PY{n}{r}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i} \PY{o}{!}\PY{o}{=} \PY{n}{r}\PY{p}{.}\PY{n}{end}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)} + \PY{k}{if} \PY{p}{(} \PY{n}{m\PYZus{}data}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{<} \PY{n}{min} \PY{p}{)} + 
\PY{n}{min} \PY{o}{=} \PY{n}{m\PYZus{}data}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{;} + + \PY{n}{m\PYZus{}min} \PY{o}{=} \PY{n}{min}\PY{p}{;} + \PY{p}{\PYZcb{}} + + \PY{k+kt}{void} \PY{n}{join}\PY{p}{(}\PY{k}{const} \PY{n}{itbbReduce}\PY{o}{&} \PY{n}{other}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{k}{if} \PY{p}{(} \PY{n}{other}\PY{p}{.}\PY{n}{m\PYZus{}min} \PY{o}{<} \PY{n}{m\PYZus{}min} \PY{p}{)} + \PY{n}{m\PYZus{}min} \PY{o}{=} \PY{n}{other}\PY{p}{.}\PY{n}{m\PYZus{}min}\PY{p}{;} + \PY{p}{\PYZcb{}} +\PY{p}{\PYZcb{}}\PY{p}{;} + +\PY{n}{itbbReduce} \PY{n}{mif}\PY{p}{(}\PY{n}{data}\PY{p}{)}\PY{p}{;} +\PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{parallel\PYZus{}reduce}\PY{p}{(}\PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{blocked\PYZus{}range}\PY{o}{<}\PY{n}{size\PYZus{}t}\PY{o}{>}\PY{p}{(}\PY{l+m+mi}{0}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{CHUNK\PYZus{}SIZE}\PY{p}{)}\PY{p}{,} \PY{n}{mif}\PY{p}{)}\PY{p}{;} +\PY{k+kt}{float} \PY{n}{min} \PY{o}{=} \PY{n}{mif}\PY{p}{.}\PY{n}{m\PYZus{}min}\PY{p}{;} +\end{Verbatim} + +\end{block} + +\end{frame} +%----------- slide --------------------------------------------------% + + +\begin{frame}[fragile]{QtConcurrent reduce} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{void} \PY{n}{findMinimum}\PY{p}{(}\PY{k}{const} \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{:}\PY{o}{:}\PY{n}{const\PYZus{}iterator} \PY{n}{begin}\PY{p}{,} + \PY{k}{const} \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{:}\PY{o}{:}\PY{n}{const\PYZus{}iterator} \PY{n}{end}\PY{p}{,} + \PY{k+kt}{float} \PY{o}{*}\PY{n}{result}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{n}{result} \PY{o}{=} \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{min\PYZus{}element}\PY{p}{(}\PY{n}{begin}\PY{p}{,} \PY{n}{end}\PY{p}{)}\PY{p}{;} +\PY{p}{\PYZcb{}} + + +\PY{k+kt}{float} 
\PY{n}{QtReduce}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{,} + \PY{k}{const} \PY{k+kt}{int} \PY{n}{numberOfThreads}\PY{p}{,} + \PY{k}{const} \PY{k+kt}{int} \PY{n}{chunkSize}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>} \PY{n}{separate\PYZus{}results}\PY{p}{(}\PY{n}{numberOfThreads}\PY{p}{,} \PY{n}{FLT\PYZus{}MAX}\PY{p}{)}\PY{p}{;} + \PY{n}{QFutureSynchronizer}\PY{o}{<}\PY{k+kt}{void}\PY{o}{>} \PY{n}{synchronizer}\PY{p}{;} + + \PY{k}{for}\PY{p}{(}\PY{k+kt}{int} \PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{numberOfThreads}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)} + \PY{n}{synchronizer}\PY{p}{.}\PY{n}{addFuture}\PY{p}{(}\PY{n}{QtConcurrent}\PY{o}{:}\PY{o}{:}\PY{n}{run}\PY{p}{(}\PY{n}{findLocalMinimum}\PY{p}{,} + \PY{n}{data}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{o}{+}\PY{n}{i}\PY{o}{*}\PY{n}{chunkSize}\PY{p}{,} + \PY{n}{data}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{o}{+}\PY{p}{(}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{1}\PY{p}{)}\PY{o}{*}\PY{n}{chunkSize}\PY{p}{,} + \PY{n}{separate\PYZus{}results}\PY{p}{.}\PY{n}{data}\PY{p}{(}\PY{p}{)}\PY{o}{+}\PY{n}{i}\PY{p}{)}\PY{p}{)}\PY{p}{;} + + \PY{n}{synchronizer}\PY{p}{.}\PY{n}{waitForFinished}\PY{p}{(}\PY{p}{)}\PY{p}{;} + + \PY{k+kt}{float} \PY{n}{min}\PY{p}{(}\PY{n}{FLT\PYZus{}MAX}\PY{p}{)}\PY{p}{;} + \PY{n}{findMinimum}\PY{p}{(}\PY{n}{separate\PYZus{}results}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{separate\PYZus{}results}\PY{p}{.}\PY{n}{end}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{min}\PY{p}{)}\PY{p}{;} + \PY{k}{return} \PY{n}{min}\PY{p}{;} +\PY{p}{\PYZcb{}} +\end{Verbatim} + +\end{block} + +\end{frame} + +%----------- slide --------------------------------------------------% + +\begin{frame}[fragile]{Reduce execution times} + +\begin{center} +\includegraphics[height=5cm]{reduce.png} +\end{center} + +\begin{exampleblock}{Note} +\small +No need for more 
than 4 threads. +\end{exampleblock} + +\end{frame} + +\subsection{Sort} + +%----------- slide --------------------------------------------------% + + +\begin{frame}[fragile]{Serial sort} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{void} \PY{n}{serialSort}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{sort}\PY{p}{(}\PY{n}{data}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{end}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{;} +\PY{p}{\PYZcb{}} +\end{Verbatim} + +\end{block} + +\begin{exampleblock}{Note: quicksort} + \small + \begin{itemize} + \item Pick a pivot point. + \item Partition: swap elements to the correct side of the pivot. + \item Recursively sort the two new partitions. + \end{itemize} +\end{exampleblock} + +\end{frame} + +%----------- slide --------------------------------------------------% + + +\begin{frame}[fragile]{openMP, Intel TBB sort} + +\begin{block}{openMP c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{c+cp}{\PYZsh{}}\PY{c+cp}{include <parallel/algorithm>} + +\PY{k+kt}{void} \PY{n}{openMpSort}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{n}{\PYZus{}\PYZus{}gnu\PYZus{}parallel}\PY{o}{:}\PY{o}{:}\PY{n}{sort}\PY{p}{(}\PY{n}{data}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{end}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{;} +\PY{p}{\PYZcb{}} +\end{Verbatim} + +\end{block} + +\begin{exampleblock}{Note} +\small +Some algorithms are already rewritten to work in parallel with openMP.
+\end{exampleblock} + +\begin{block}{Intel TBB c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{void} \PY{n}{itbbSort}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{parallel\PYZus{}sort}\PY{p}{(}\PY{n}{data}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{end}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{;} +\PY{p}{\PYZcb{}} +\end{Verbatim} + +\end{block} + +\end{frame} + +%----------- slide --------------------------------------------------% + +\begin{frame}[fragile]{Sort execution times} + +\begin{center} +\includegraphics[height=5cm]{sort1.png} +\end{center} + +\begin{exampleblock}{Note} +\small +No need for more than 6 threads. +\end{exampleblock} + +\end{frame} + +%----------- slide --------------------------------------------------% + + +\begin{frame}[fragile]{Custom QtConcurrent sort} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k}{template} \PY{o}{<}\PY{k}{class} \PY{n+nc}{SortType}\PY{o}{>} +\PY{k+kt}{long} \PY{n}{QsPartition}\PY{p}{(}\PY{n}{SortType} \PY{n}{outputArray}\PY{p}{[}\PY{p}{]}\PY{p}{,} \PY{k+kt}{long} \PY{n}{left}\PY{p}{,} \PY{k+kt}{long} \PY{n}{right}\PY{p}{)} \PY{p}{\PYZob{}} \PY{p}{.}\PY{p}{.}\PY{p}{.} \PY{p}{\PYZcb{}} + +\PY{k}{template} \PY{o}{<}\PY{k}{class} \PY{n+nc}{SortType}\PY{o}{>} +\PY{k+kt}{void} \PY{n}{QsSequential}\PY{p}{(}\PY{n}{SortType} \PY{n}{array}\PY{p}{[}\PY{p}{]}\PY{p}{,} \PY{k}{const} \PY{k+kt}{long} \PY{n}{left}\PY{p}{,} \PY{k}{const} \PY{k+kt}{long} \PY{n}{right}\PY{p}{)} \PY{p}{\PYZob{}} \PY{p}{.}\PY{p}{.}\PY{p}{.} \PY{p}{\PYZcb{}} + +\PY{k}{template} \PY{o}{<}\PY{k}{class} \PY{n+nc}{SortType}\PY{o}{>} +\PY{k+kt}{void} \PY{n}{QuickSortTask} \PY{p}{(}\PY{n}{SortType} \PY{n}{array}\PY{p}{[}\PY{p}{]}\PY{p}{,} \PY{k}{const} \PY{k+kt}{long} 
\PY{n}{left}\PY{p}{,} \PY{k}{const} \PY{k+kt}{long} \PY{n}{right}\PY{p}{,} \PY{k}{const} \PY{k+kt}{int} \PY{n}{deep}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{k}{if} \PY{p}{(}\PY{n}{left} \PY{o}{<} \PY{n}{right}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{k}{if} \PY{p}{(}\PY{n}{deep}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{k}{const} \PY{k+kt}{long} \PY{n}{part} \PY{o}{=} \PY{n}{QsPartition}\PY{p}{(}\PY{n}{array}\PY{p}{,} \PY{n}{left}\PY{p}{,} \PY{n}{right}\PY{p}{)}\PY{p}{;} + \PY{n}{QtConcurrent}\PY{o}{:}\PY{o}{:}\PY{n}{run}\PY{p}{(}\PY{n}{QuickSortTask}\PY{o}{<}\PY{n}{SortType}\PY{o}{>}\PY{p}{,} \PY{n}{array}\PY{p}{,} \PY{n}{part} \PY{o}{+} \PY{l+m+mi}{1}\PY{p}{,} \PY{n}{right}\PY{p}{,} \PY{n}{deep} \PY{o}{-} \PY{l+m+mi}{1}\PY{p}{)}\PY{p}{;} + \PY{n}{QtConcurrent}\PY{o}{:}\PY{o}{:}\PY{n}{run}\PY{p}{(}\PY{n}{QuickSortTask}\PY{o}{<}\PY{n}{SortType}\PY{o}{>}\PY{p}{,} \PY{n}{array}\PY{p}{,} \PY{n}{left}\PY{p}{,} \PY{n}{part} \PY{o}{-} \PY{l+m+mi}{1}\PY{p}{,} \PY{n}{deep} \PY{o}{-} \PY{l+m+mi}{1}\PY{p}{)}\PY{p}{;} + \PY{p}{\PYZcb{}} \PY{k}{else} \PY{p}{\PYZob{}} + \PY{k}{const} \PY{k+kt}{long} \PY{n}{part} \PY{o}{=} \PY{n}{QsPartition}\PY{p}{(}\PY{n}{array}\PY{p}{,} \PY{n}{left}\PY{p}{,} \PY{n}{right}\PY{p}{)}\PY{p}{;} + \PY{n}{QsSequential}\PY{p}{(}\PY{n}{array}\PY{p}{,}\PY{n}{part} \PY{o}{+} \PY{l+m+mi}{1}\PY{p}{,}\PY{n}{right}\PY{p}{)}\PY{p}{;} + \PY{n}{QsSequential}\PY{p}{(}\PY{n}{array}\PY{p}{,}\PY{n}{left}\PY{p}{,}\PY{n}{part} \PY{o}{-} \PY{l+m+mi}{1}\PY{p}{)}\PY{p}{;} + \PY{p}{\PYZcb{}} + \PY{p}{\PYZcb{}} +\PY{p}{\PYZcb{}} + +\PY{k+kt}{void} \PY{n}{QtSort}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{data}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{n}{QtConcurrent}\PY{o}{:}\PY{o}{:}\PY{n}{run}\PY{p}{(}\PY{n}{QuickSortTask}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{data}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{l+m+mi}{0}\PY{p}{,} \PY{n}{data}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)} \PY{o}{-} \PY{l+m+mi}{1}\PY{p}{,} 
\PY{l+m+mi}{6}\PY{p}{)}\PY{p}{;} + \PY{n}{QThreadPool}\PY{o}{:}\PY{o}{:}\PY{n}{globalInstance}\PY{p}{(}\PY{p}{)}\PY{o}{-}\PY{o}{>}\PY{n}{waitForDone}\PY{p}{(}\PY{p}{)}\PY{p}{;} +\PY{p}{\PYZcb{}} +\end{Verbatim} + +\end{block} + +\end{frame} + +%----------- slide --------------------------------------------------% + + +\begin{frame}[fragile]{Custom openMP sort} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{void} \PY{n}{sample\PYZus{}qsort}\PY{p}{(}\PY{k+kt}{float}\PY{o}{*} \PY{n}{begin}\PY{p}{,} \PY{k+kt}{float}\PY{o}{*} \PY{n}{end}\PY{p}{)} \PY{p}{\PYZob{}} \PY{p}{.}\PY{p}{.}\PY{p}{.} \PY{p}{\PYZcb{}} + +\PY{k+kt}{void} \PY{n}{sample\PYZus{}qsort\PYZus{}serial}\PY{p}{(}\PY{k+kt}{float}\PY{o}{*} \PY{n}{begin}\PY{p}{,} \PY{k+kt}{float}\PY{o}{*} \PY{n}{end}\PY{p}{)} \PY{p}{\PYZob{}} \PY{p}{.}\PY{p}{.}\PY{p}{.} \PY{p}{\PYZcb{}} + +\PY{k+kt}{void} \PY{n}{sample\PYZus{}qsort\PYZus{}adaptive}\PY{p}{(}\PY{k+kt}{float}\PY{o}{*} \PY{n}{begin}\PY{p}{,} \PY{k+kt}{float}\PY{o}{*} \PY{n}{end}\PY{p}{,} \PY{k}{const} \PY{k+kt}{long} \PY{n}{nthreshold}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{k}{if} \PY{p}{(}\PY{n}{begin} \PY{o}{!}\PY{o}{=} \PY{n}{end}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{c+c1}{// partition ...} + \PY{k}{if} \PY{p}{(}\PY{n}{end} \PY{o}{-} \PY{n}{begin} \PY{o}{+} \PY{l+m+mi}{1} \PY{o}{<}\PY{o}{=} \PY{n}{nthreshold}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{n}{sample\PYZus{}qsort\PYZus{}serial}\PY{p}{(}\PY{n}{begin}\PY{p}{,} \PY{n}{middle}\PY{p}{)}\PY{p}{;} + \PY{n}{sample\PYZus{}qsort\PYZus{}serial}\PY{p}{(}\PY{o}{+}\PY{o}{+}\PY{n}{middle}\PY{p}{,} \PY{o}{+}\PY{o}{+}\PY{n}{end}\PY{p}{)}\PY{p}{;} + \PY{p}{\PYZcb{}} \PY{k}{else} \PY{p}{\PYZob{}} +\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp task} + \PY{n}{sample\PYZus{}qsort\PYZus{}adaptive}\PY{p}{(}\PY{n}{begin}\PY{p}{,} \PY{n}{middle}\PY{p}{,} \PY{n}{nthreshold}\PY{p}{)}\PY{p}{;} +\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp task} +
\PY{n}{sample\PYZus{}qsort\PYZus{}adaptive}\PY{p}{(}\PY{o}{+}\PY{o}{+}\PY{n}{middle}\PY{p}{,} \PY{o}{+}\PY{o}{+}\PY{n}{end}\PY{p}{,} \PY{n}{nthreshold}\PY{p}{)}\PY{p}{;} + \PY{p}{\PYZcb{}} + \PY{p}{\PYZcb{}} +\PY{p}{\PYZcb{}} + +\PY{k+kt}{void} \PY{n}{sample\PYZus{}qsort\PYZus{}adaptive}\PY{p}{(}\PY{k+kt}{float}\PY{o}{*} \PY{n}{begin}\PY{p}{,} \PY{k+kt}{float}\PY{o}{*} \PY{n}{end}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{k+kt}{long} \PY{n}{nthreshold} \PY{o}{=} \PY{n}{ceil}\PY{p}{(}\PY{n}{sqrt}\PY{p}{(}\PY{n}{end} \PY{o}{-} \PY{n}{begin} \PY{o}{+} \PY{l+m+mi}{1}\PY{p}{)}\PY{p}{)} \PY{o}{/} \PY{l+m+mi}{2}\PY{p}{;} +\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp parallel} +\PY{c+cp}{\PYZsh{}}\PY{c+cp}{pragma omp single nowait} + \PY{n}{sample\PYZus{}qsort\PYZus{}adaptive}\PY{p}{(}\PY{n}{begin}\PY{p}{,} \PY{n}{end}\PY{p}{,} \PY{n}{nthreshold}\PY{p}{)}\PY{p}{;} +\PY{p}{\PYZcb{}} +\end{Verbatim} + +\end{block} + +\end{frame} + +%----------- slide --------------------------------------------------% + +\begin{frame}[fragile]{Sort times of custom algorithms} + +\begin{center} +\includegraphics[height=5cm]{sort2.png} +\end{center} + + +\begin{exampleblock}{Note} +\small +Container size is 6M - miserable... 
+\end{exampleblock} + +\end{frame} + +%----------- slide --------------------------------------------------% + + +\begin{frame}[fragile]{Two approaches to limiting quicksort parallelism} + +\begin{columns}[t] + +\column{1.7in} +\begin{block}{Threshold} +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{void} \PY{n}{qsort}\PY{p}{(}\PY{k+kt}{float}\PY{o}{*} \PY{n}{begin}\PY{p}{,} + \PY{k+kt}{float}\PY{o}{*} \PY{n}{end}\PY{p}{,} + \PY{k}{const} \PY{k+kt}{long} \PY{n}{nthreshold}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{k}{if} \PY{p}{(}\PY{n}{begin} \PY{o}{!}\PY{o}{=} \PY{n}{end}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{c+c1}{// partition ...} + \PY{k}{if} \PY{p}{(}\PY{n}{end}\PY{o}{-}\PY{n}{begin}\PY{o}{+}\PY{l+m+mi}{1} \PY{o}{<}\PY{o}{=} \PY{n}{nthreshold}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{c+c1}{// serial sort ...} + \PY{p}{\PYZcb{}} \PY{k}{else} \PY{p}{\PYZob{}} + \PY{c+c1}{// parallel sort ...} + \PY{p}{\PYZcb{}} + \PY{p}{\PYZcb{}} +\PY{p}{\PYZcb{}} + +\PY{k+kt}{long} \PY{n}{nthreshold} \PY{o}{=} + \PY{n}{ceil}\PY{p}{(}\PY{n}{sqrt}\PY{p}{(}\PY{n}{end} \PY{o}{-} \PY{n}{begin} \PY{o}{+} \PY{l+m+mi}{1}\PY{p}{)}\PY{p}{)} \PY{o}{/} \PY{l+m+mi}{2}\PY{p}{;} +\end{Verbatim} + +\end{block} + +\column{1.5in} +\begin{block}{Depth} +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{void} \PY{n}{qsort}\PY{p}{(}\PY{k+kt}{float}\PY{o}{*} \PY{n}{begin}\PY{p}{,} + \PY{k+kt}{float}\PY{o}{*} \PY{n}{end}\PY{p}{,} + \PY{k}{const} \PY{k+kt}{int} \PY{n}{deep}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{k}{if} \PY{p}{(}\PY{n}{begin} \PY{o}{!}\PY{o}{=} \PY{n}{end}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{c+c1}{// partition ...} + \PY{k}{if} \PY{p}{(}\PY{n}{deep}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{c+c1}{// parallel sort with deep-1} + \PY{p}{\PYZcb{}} \PY{k}{else} \PY{p}{\PYZob{}} + \PY{c+c1}{// serial sort ...} + \PY{p}{\PYZcb{}} + \PY{p}{\PYZcb{}} +\PY{p}{\PYZcb{}} + +\PY{k+kt}{long} \PY{n}{deep} \PY{o}{=} \PY{l+m+mi}{15}\PY{p}{;} +\end{Verbatim} +\end{block} + +
+\end{columns} + +\begin{exampleblock}{Note} +\small +The depth-based approach seems both simpler and faster. +\end{exampleblock} + +\end{frame} + + +\section{Final thoughts} + + +%----------- slide --------------------------------------------------% + +\subsection{Grainsize} + +%----------- slide --------------------------------------------------% + +\begin{frame}[fragile]{Chunk size} + +\begin{center} +\includegraphics[height=5cm]{chunksize.png} +\end{center} + +\begin{exampleblock}{Note} +\small +\begin{itemize} + \item Unit is loop iterations per chunk. Default value is 1. + \item Chunks that are too small can introduce more overhead than useful work. +\end{itemize} +\end{exampleblock} + +\end{frame} + +%----------- slide --------------------------------------------------% + +\begin{frame}[fragile]{Grain size} + +\begin{center} +\includegraphics[height=5cm]{grainsize.png} +\end{center} + +\begin{exampleblock}{Note} +\small +\begin{itemize} + \item Unit is CPU cycles. + \item Should be at least $\sim$100,000. +\end{itemize} +\end{exampleblock} + +\end{frame} + +%----------- slide --------------------------------------------------% + +\begin{frame}[fragile]{Task stealing - Intel TBB} + +\begin{block}{Task stealing} +\begin{itemize} + \item Each thread has a queue of tasks. + \item If a thread has no more tasks then it ``steals'' from another. + \item Think about tasks, not about threads when programming. +\end{itemize} +\end{block} + +\begin{exampleblock}{Threadpool} +A thread pool with a shared concurrent queue of tasks is common practice in network servers. +\end{exampleblock} + +\begin{exampleblock}{Work stealing} +Another implementation is Cilk\cite{cilk} - where each processor has a stack of frames.
+\end{exampleblock} + + +\end{frame} + + +\subsection{Convolution} + +%----------- slide --------------------------------------------------% + +\begin{frame}[fragile]{1D Gaussian filter} + +\begin{columns}[t] + +\column{2.5in} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{void} \PY{n}{serialConvolution}\PY{p}{(}\PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{output}\PY{p}{,} + \PY{k}{const} \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{input}\PY{p}{,} + \PY{k}{const} \PY{n}{std}\PY{o}{:}\PY{o}{:}\PY{n}{vector}\PY{o}{<}\PY{k+kt}{float}\PY{o}{>}\PY{o}{&} \PY{n}{kernel}\PY{p}{)} +\PY{p}{\PYZob{}} + \PY{c+c1}{// skipping the edges: separate loops, paddings} + \PY{c+c1}{// output.size == input.size()-kernel.size()-1;} + + \PY{k}{for} \PY{p}{(}\PY{n}{size\PYZus{}t} \PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{output}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i}\PY{o}{+}\PY{o}{+}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{k+kt}{float} \PY{n}{sum} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} + \PY{k}{for} \PY{p}{(}\PY{n}{size\PYZus{}t} \PY{n}{j} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{j} \PY{o}{<} \PY{n}{kernel}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{j}\PY{o}{+}\PY{o}{+}\PY{p}{)} + \PY{n}{sum} \PY{o}{+}\PY{o}{=} \PY{n}{input}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{n}{j}\PY{p}{]} \PY{o}{*} \PY{n}{kernel}\PY{p}{[}\PY{n}{j}\PY{p}{]}\PY{p}{;} + + \PY{n}{output}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{=} \PY{n}{sum}\PY{p}{;} + \PY{p}{\PYZcb{}} +\PY{p}{\PYZcb{}} +\end{Verbatim} + + +\end{block} + +\column{1.5in} + +\includegraphics[height=3cm]{1d_gauss.png} + +\end{columns} + +\begin{exampleblock}{Note} +\tiny +\verb|float kernel[7] = { 0.006, 0.061, 0.242, 0.383, 0.242, 0.061, 0.006 }| +\end{exampleblock} + +\end{frame} + + +%----------- slide
--------------------------------------------------% + + +\begin{frame}[fragile]{Optimized convolution} + +\begin{block}{c++ code} + +\tiny +\begin{Verbatim}[commandchars=\\\{\},numbers=left,firstnumber=1,stepnumber=1] +\PY{k+kt}{void} \PY{k}{operator}\PY{p}{(}\PY{p}{)}\PY{p}{(}\PY{k}{const} \PY{n}{tbb}\PY{o}{:}\PY{o}{:}\PY{n}{blocked\PYZus{}range}\PY{o}{<}\PY{n}{size\PYZus{}t}\PY{o}{>}\PY{o}{&} \PY{n}{r}\PY{p}{)} \PY{k}{const} +\PY{p}{\PYZob{}} + \PY{c+c1}{// skipping the edges, shall be done in separate task} + \PY{k}{const} \PY{k+kt}{float}\PY{o}{*} \PY{n}{p} \PY{o}{=} \PY{o}{&}\PY{n}{m\PYZus{}input}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]} \PY{o}{+} \PY{n}{r}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{;} + \PY{k+kt}{float}\PY{o}{*} \PY{n}{d} \PY{o}{=} \PY{o}{&}\PY{n}{m\PYZus{}output}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]} \PY{o}{+} \PY{n}{r}\PY{p}{.}\PY{n}{begin}\PY{p}{(}\PY{p}{)}\PY{p}{;} + + \PY{k}{const} \PY{n}{size\PYZus{}t} \PY{n}{n} \PY{o}{=} \PY{n}{m\PYZus{}kernel}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{;} + \PY{k+kt}{float} \PY{n}{k}\PY{p}{[}\PY{n}{n}\PY{p}{]}\PY{p}{;} \PY{c+c1}{// pre-read kernel} + \PY{k+kt}{float} \PY{n}{c}\PY{p}{[}\PY{n}{n}\PY{p}{]}\PY{p}{;} \PY{c+c1}{// pre-read values} + \PY{n}{k}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]} \PY{o}{=} \PY{n}{m\PYZus{}kernel}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{;} + \PY{k}{for} \PY{p}{(}\PY{n}{size\PYZus{}t} \PY{n}{i} \PY{o}{=} \PY{l+m+mi}{1}\PY{p}{;} \PY{n}{i} \PY{o}{<} \PY{n}{n}\PY{p}{;} \PY{o}{+}\PY{o}{+}\PY{n}{i}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{n}{c}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{=} \PY{n}{m\PYZus{}input}\PY{p}{[}\PY{n}{i}\PY{o}{-}\PY{l+m+mi}{1}\PY{p}{]}\PY{p}{;} + \PY{n}{k}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{=} \PY{n}{m\PYZus{}kernel}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{;} + \PY{p}{\PYZcb{}} + + \PY{c+c1}{// chunk size \PYZpc{} kernel.size() != 0 should be handled...} + \PY{k}{for} \PY{p}{(}\PY{n}{size\PYZus{}t} \PY{n}{i} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{;} \PY{n}{i} \PY{o}{<} 
\PY{n}{r}\PY{p}{.}\PY{n}{size}\PY{p}{(}\PY{p}{)}\PY{p}{;} \PY{n}{i} \PY{o}{+}\PY{o}{=} \PY{n}{n}\PY{p}{)} \PY{p}{\PYZob{}} + \PY{n}{d}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{0}\PY{p}{]} \PY{o}{=} \PY{p}{(}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]} \PY{o}{=} \PY{n}{p}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{)}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{p}{;} + \PY{n}{d}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{1}\PY{p}{]} \PY{o}{=} \PY{p}{(}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]} \PY{o}{=} \PY{n}{p}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{1}\PY{p}{]}\PY{p}{)}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{p}{;} + \PY{n}{d}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{2}\PY{p}{]} \PY{o}{=} \PY{p}{(}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]} \PY{o}{=} 
\PY{n}{p}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{2}\PY{p}{]}\PY{p}{)}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{p}{;} + \PY{n}{d}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{3}\PY{p}{]} \PY{o}{=} \PY{p}{(}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]} \PY{o}{=} \PY{n}{p}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{3}\PY{p}{]}\PY{p}{)}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{p}{;} + \PY{n}{d}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{4}\PY{p}{]} \PY{o}{=} \PY{p}{(}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]} \PY{o}{=} 
\PY{n}{p}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{4}\PY{p}{]}\PY{p}{)}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{p}{;} + \PY{n}{d}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{5}\PY{p}{]} \PY{o}{=} \PY{p}{(}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]} \PY{o}{=} \PY{n}{p}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{5}\PY{p}{]}\PY{p}{)}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{p}{;} + \PY{n}{d}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{6}\PY{p}{]} \PY{o}{=} \PY{p}{(}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]} \PY{o}{=} 
\PY{n}{p}\PY{p}{[}\PY{n}{i}\PY{o}{+}\PY{l+m+mi}{6}\PY{p}{]}\PY{p}{)}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{2}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{3}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{5}\PY{p}{]}\PY{o}{+}\PY{n}{c}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{*}\PY{n}{k}\PY{p}{[}\PY{l+m+mi}{6}\PY{p}{]}\PY{p}{;} + \PY{p}{\PYZcb{}} +\PY{p}{\PYZcb{}} +\end{Verbatim} + + +\end{block} + +\end{frame} + + +%----------- slide --------------------------------------------------% + +\begin{frame}[fragile]{Convolution running times} + +\begin{center} +\includegraphics[height=5cm]{convolution.png} +\end{center} + + +\begin{exampleblock}{Note} +\small +Optimizing memory reads can yield the same performance improvement as parallelization. +\end{exampleblock} + +\end{frame} + +%----------- slide --------------------------------------------------% + + + +\subsection{Summary} + +\begin{frame}{Things to keep in mind} + +\begin{block}{Checklist} + \small + \begin{itemize} + \item Pass primitive types by value. + \item Pass objects by address. + \item Have function-local copies of member variables. + \item Avoid reading the same value multiple times. + \item Choose the correct chunk size. + \item Instead of shared memory, consider reduction. + \item Plan data structures so the code does not become memory-bound.* + \end{itemize} +\end{block} + +\pause + +\begin{exampleblock}{*data-oriented design\cite{data_oriented_design}} +\small +If only someone could tell us more about it...
+\end{exampleblock} + +\end{frame} + + +\begin{frame}{Links} + +\tiny +\begin{thebibliography}{100} +\bibitem{openmp}openMP.\url{http://openmp.org} +\bibitem{itbb}Intel Threading Building Blocks.\url{http://threadingbuildingblocks.org/} +\bibitem{qtconcurrent}QtConcurrent.\url{http://doc.qt.nokia.com/4.8-snapshot/qtconcurrent.html} +\bibitem{cilk}Cilk.\url{http://software.intel.com/en-us/articles/intel-cilk-plus} +\bibitem{itbb_openmp_nativethreads}Comparison of Intel TBB, openMP and native threads.\url{http://software.intel.com/en-us/articles/intel-threading-building-blocks-openmp-or-native-threads/} + +\bibitem{cpp_thread}std::thread in C++.\url{http://en.cppreference.com/w/cpp/thread} +\bibitem{posix_threads}POSIX threads tutorial.\url{http://www.yolinux.com/TUTORIALS/LinuxTutorialPosixThreads.html} +\bibitem{qt_thread}Qt threads.\url{http://qt-project.org/doc/qt-4.8/threads.html} + +\bibitem{data_oriented_design}Data-oriented design.\url{http://gamesfromwithin.com/data-oriented-design} + +\bibitem{latex_beamer}\LaTeX{} beamer class for creating presentations.\url{https://bitbucket.org/rivanvx/beamer/wiki/Home} +\bibitem{gnuplot}Gnuplot - open-source plotting software.\url{http://www.gnuplot.info/} + +\end{thebibliography} +\end{frame} + + +\end{document} \ No newline at end of file