Differences
This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
programming:c-cpp-performance [2008/10/21 16:14] cyril |
programming:c-cpp-performance [2013/09/19 16:40] (current) |
||
---|---|---|---|
Line 121: | Line 121: | ||
==== Other profilers ==== | ==== Other profilers ==== | ||
- | sysprof and oprofile are other profiles | + | sysprof and oprofile are other profilers |
==== Traps ==== | ==== Traps ==== | ||
Line 178: | Line 178: | ||
Therefore all simple counters should be done in the natural word size of the CPU. | Therefore all simple counters should be done in the natural word size of the CPU. | ||
+ | ====== Vectorization ====== | ||
+ | |||
+ | Since Pentium III with SSE instructions, | ||
+ | |||
+ | However there are two problems: | ||
+ | * these operations are already quite fast on their own (each one is a single instruction), so if you need to reorganize a lot of data to lay it out as a contiguous vector, the overhead quickly cancels out what you gain by parallelizing the operations. | ||
+ | * data **must** be 16-byte aligned, which can prevent you from using raw data directly (e.g. if you want to compute a Haar feature whose size is not a multiple of 4 with an integral image), which brings back the previous problem... | ||
+ | |||
+ | Still, when you can use it, it can be worth the pain, especially with divisions or sqrt, which are particularly cycle-consuming, e.g. to compute 4 parabolic interpolations simultaneously: | ||
+ | |||
+ | <code cpp> | ||
+ | struct SSE_f | ||
+ | { | ||
+ | typedef float v4sf __attribute__((vector_size(16))); | ||
+ | union { v4sf v; float f[4]; }; | ||
+ | }; | ||
+ | |||
+ | inline void parabolicInterpolation4( | ||
+ | const SSE_f &x0, const SSE_f &y0, const SSE_f &x1, const SSE_f &y1, const SSE_f &x2, const SSE_f & | ||
+ | SSE_f & | ||
+ | { | ||
+ | SSE_f x01; x01.v = _mm_sub_ps(x0.v, | ||
+ | SSE_f x02; x02.v = _mm_sub_ps(x0.v, | ||
+ | SSE_f x12; x12.v = _mm_sub_ps(x1.v, | ||
+ | SSE_f t0; t0.v = _mm_div_ps(y0.v, | ||
+ | SSE_f t1; t1.v = _mm_div_ps(y1.v, | ||
+ | SSE_f t2; t2.v = _mm_div_ps(y2.v, | ||
+ | a.v = _mm_add_ps(_mm_sub_ps(t0, | ||
+ | b.v = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(x0.v, | ||
+ | _mm_add_ps(_mm_mul_ps(_mm_add_ps(x1.v, | ||
+ | | ||
+ | c.v = _mm_add_ps(_mm_sub_ps( | ||
+ | _mm_mul_ps(_mm_mul_ps(x1.v, | ||
+ | _mm_mul_ps(_mm_mul_ps(x0.v, | ||
+ | _mm_mul_ps(_mm_mul_ps(x0.v, | ||
+ | extremum_x.v = _mm_div_ps(b.v, | ||
+ | extremum_y.v = _mm_sub_ps(c.v, | ||
+ | } | ||
+ | </ | ||
+ | |||
+ | It takes roughly 60% less time than to compute them sequentially, | ||
+ | |||
+ | References: | ||
+ | * [[http:// | ||
+ | * [[http:// | ||
+ | * [[http:// | ||
+ | |||
+ | Simpler names are available in the GCC headers: | ||
+ | * SSE for float (< | ||
+ | * SSE2 for int (< | ||
+ | * SSE2 int/float conversions: | ||
+ | |||
+ | You also have to compile with the GCC flags -msse and -msse2, or with a -march value that supports them. | ||
====== Measuring performance ====== | ====== Measuring performance ====== | ||
Line 187: | Line 240: | ||
#include < | #include < | ||
struct timeval tv; | struct timeval tv; | ||
- | struct timezone tz; | + | gettimeofday(& |
- | gettimeofday(& | + | |
unsigned microseconds = tv.tv_sec*1000000 + tv.tv_usec; // beware overflows | unsigned microseconds = tv.tv_sec*1000000 + tv.tv_usec; // beware overflows |