Differences
This shows you the differences between two versions of the page.
| Both sides previous revision Previous revision Next revision | Previous revision | ||
|
programming:c-cpp-performance [2008/01/30 07:37] 127.0.0.1 external edit |
programming:c-cpp-performance [2013/09/19 16:40] (current) |
||
|---|---|---|---|
| Line 26: | Line 26: | ||
| You can even use templates to have a generic function : | You can even use templates to have a generic function : | ||
| <code cpp> | <code cpp> | ||
| - | <template typename T> | + | template |
| inline T sqr(T x) { return x*x; } | inline T sqr(T x) { return x*x; } | ||
| </ | </ | ||
| Line 33: | Line 33: | ||
| <code cpp> | <code cpp> | ||
| int x = 5; | int x = 5; | ||
| - | int y = sqr< | + | int y = sqr(x); |
| + | int y2 = sqr< | ||
| </ | </ | ||
| Line 120: | Line 121: | ||
| ==== Other profilers ==== | ==== Other profilers ==== | ||
| - | sysprof and oprofile are other profiles | + | sysprof and oprofile are other profilers |
| ==== Traps ==== | ==== Traps ==== | ||
| Line 177: | Line 178: | ||
| Therefore all simple counters should be done in the natural word size of the CPU. | Therefore all simple counters should be done in the natural word size of the CPU. | ||
| + | ====== Vectorization ====== | ||
| + | |||
| + | Since Pentium III with SSE instructions, | ||
| + | |||
| + | However there are two problems: | ||
| + | * these operations are already quite fast (especially since each one is a single operation), so if you need to do a lot of data reorganization to lay the values out as a contiguous vector, the overhead will quickly cancel out what you gain by parallelizing the operations. | ||
| + | * data **must** be 16-byte aligned, which can prevent you from using raw data directly (e.g. if you want to compute a Haar feature whose size is not a multiple of 4 with an integral image), which brings back the previous problem... | ||
| + | |||
| + | Still, when you can use it, it can be worth the pain, especially with divisions or sqrt, which are particularly cycle-consuming, e.g. to compute 4 parabolic interpolations simultaneously: | ||
| + | |||
| + | <code cpp> | ||
| + | struct SSE_f | ||
| + | { | ||
| + | typedef float v4sf __attribute__((vector_size(16))); | ||
| + | union { v4sf v; float f[4]; }; | ||
| + | }; | ||
| + | |||
| + | inline void parabolicInterpolation4( | ||
| + | const SSE_f &x0, const SSE_f &y0, const SSE_f &x1, const SSE_f &y1, const SSE_f &x2, const SSE_f & | ||
| + | SSE_f & | ||
| + | { | ||
| + | SSE_f x01; x01.v = _mm_sub_ps(x0.v, | ||
| + | SSE_f x02; x02.v = _mm_sub_ps(x0.v, | ||
| + | SSE_f x12; x12.v = _mm_sub_ps(x1.v, | ||
| + | SSE_f t0; t0.v = _mm_div_ps(y0.v, | ||
| + | SSE_f t1; t1.v = _mm_div_ps(y1.v, | ||
| + | SSE_f t2; t2.v = _mm_div_ps(y2.v, | ||
| + | a.v = _mm_add_ps(_mm_sub_ps(t0, | ||
| + | b.v = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(x0.v, | ||
| + | _mm_add_ps(_mm_mul_ps(_mm_add_ps(x1.v, | ||
| + | | ||
| + | c.v = _mm_add_ps(_mm_sub_ps( | ||
| + | _mm_mul_ps(_mm_mul_ps(x1.v, | ||
| + | _mm_mul_ps(_mm_mul_ps(x0.v, | ||
| + | _mm_mul_ps(_mm_mul_ps(x0.v, | ||
| + | extremum_x.v = _mm_div_ps(b.v, | ||
| + | extremum_y.v = _mm_sub_ps(c.v, | ||
| + | } | ||
| + | </ | ||
| + | |||
| + | It takes roughly 60% less time than computing them sequentially, | ||
| + | |||
| + | References: | ||
| + | * [[http:// | ||
| + | * [[http:// | ||
| + | * [[http:// | ||
| + | |||
| + | Simpler names are available in the GCC headers: | ||
| + | * SSE for float (< | ||
| + | * SSE2 for int (< | ||
| + | * SSE2 int/float conversions: | ||
| + | |||
| + | You also have to compile with the GCC flags -msse and -msse2, or with a -march setting that supports them. | ||
| ====== Measuring performance ====== | ====== Measuring performance ====== | ||
| Line 186: | Line 240: | ||
| #include < | #include < | ||
| struct timeval tv; | struct timeval tv; | ||
| - | struct timezone tz; | + | gettimeofday(& |
| - | gettimeofday(& | + | |
| unsigned microseconds = tv.tv_sec*1000000 + tv.tv_usec; // beware overflows | unsigned microseconds = tv.tv_sec*1000000 + tv.tv_usec; // beware overflows | ||
