Differences
This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
programming:c-cpp-performance [2008/01/30 07:37] 127.0.0.1 external edit |
programming:c-cpp-performance [2013/09/19 16:40] (current) |
||
---|---|---|---|
Line 26: | Line 26: | ||
You can even use templates to have a generic function : | You can even use templates to have a generic function : | ||
<code cpp> | <code cpp> | ||
- | <template typename T> | + | template |
inline T sqr(T x) { return x*x; } | inline T sqr(T x) { return x*x; } | ||
</ | </ | ||
Line 33: | Line 33: | ||
<code cpp> | <code cpp> | ||
int x = 5; | int x = 5; | ||
- | int y = sqr< | + | int y = sqr(x); |
+ | int y2 = sqr< | ||
</ | </ | ||
Line 120: | Line 121: | ||
==== Other profilers ==== | ==== Other profilers ==== | ||
- | sysprof and oprofile are other profiles | + | sysprof and oprofile are other profilers |
==== Traps ==== | ==== Traps ==== | ||
Line 177: | Line 178: | ||
Therefore all simple counters should be done in the natural word size of the CPU. | Therefore all simple counters should be done in the natural word size of the CPU. | ||
+ | ====== Vectorization ====== | ||
+ | |||
+ | Since Pentium III with SSE instructions, | ||
+ | |||
+ | However there are two problems: | ||
+ | * these operations are already quite fast (especially because each is a single operation), so if you need to reorganize too much data to lay it out as a contiguous vector, the overhead quickly cancels out what you gain by parallelizing the operations. | ||
+ | * data **must** be 16-byte aligned, which can prevent you from using raw data directly (e.g. if you want to compute a Haar feature whose size is not a multiple of 4 with an integral image), which brings back the previous problem... | ||
+ | |||
+ | Still, when you can use it, it can be worth the pain, especially with divisions or sqrt, which are particularly cycle-consuming, e.g. to compute 4 parabolic interpolations simultaneously: | ||
+ | |||
+ | <code cpp> | ||
+ | struct SSE_f | ||
+ | { | ||
+ | typedef float v4sf __attribute__((vector_size(16))); | ||
+ | union { v4sf v; float f[4]; }; | ||
+ | }; | ||
+ | |||
+ | inline void parabolicInterpolation4( | ||
+ | const SSE_f &x0, const SSE_f &y0, const SSE_f &x1, const SSE_f &y1, const SSE_f &x2, const SSE_f & | ||
+ | SSE_f & | ||
+ | { | ||
+ | SSE_f x01; x01.v = _mm_sub_ps(x0.v, | ||
+ | SSE_f x02; x02.v = _mm_sub_ps(x0.v, | ||
+ | SSE_f x12; x12.v = _mm_sub_ps(x1.v, | ||
+ | SSE_f t0; t0.v = _mm_div_ps(y0.v, | ||
+ | SSE_f t1; t1.v = _mm_div_ps(y1.v, | ||
+ | SSE_f t2; t2.v = _mm_div_ps(y2.v, | ||
+ | a.v = _mm_add_ps(_mm_sub_ps(t0, | ||
+ | b.v = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(x0.v, | ||
+ | _mm_add_ps(_mm_mul_ps(_mm_add_ps(x1.v, | ||
+ | | ||
+ | c.v = _mm_add_ps(_mm_sub_ps( | ||
+ | _mm_mul_ps(_mm_mul_ps(x1.v, | ||
+ | _mm_mul_ps(_mm_mul_ps(x0.v, | ||
+ | _mm_mul_ps(_mm_mul_ps(x0.v, | ||
+ | extremum_x.v = _mm_div_ps(b.v, | ||
+ | extremum_y.v = _mm_sub_ps(c.v, | ||
+ | } | ||
+ | </ | ||
+ | |||
+ | It takes roughly 60% less time than computing them sequentially, | ||
+ | |||
+ | References: | ||
+ | * [[http:// | ||
+ | * [[http:// | ||
+ | * [[http:// | ||
+ | |||
+ | Simpler names are available in GCC headers: | ||
+ | * SSE for float (< | ||
+ | * SSE2 for int (< | ||
+ | * SSE2 int/float conversions: | ||
+ | |||
+ | You also have to compile with the GCC flags -msse and -msse2, or with a -march value that supports them. | ||
====== Measuring performance ====== | ====== Measuring performance ====== | ||
Line 186: | Line 240: | ||
#include < | #include < | ||
struct timeval tv; | struct timeval tv; | ||
- | struct timezone tz; | + | gettimeofday(& |
- | gettimeofday(& | + | |
unsigned microseconds = tv.tv_sec*1000000 + tv.tv_usec; // beware overflows | unsigned microseconds = tv.tv_sec*1000000 + tv.tv_usec; // beware overflows |