[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[libcvd-members] libcvd cvd/convolution.h cvd_src/convolution.cc...
From: |
Ethan Eade |
Subject: |
[libcvd-members] libcvd cvd/convolution.h cvd_src/convolution.cc... |
Date: |
Fri, 22 Feb 2008 15:24:34 +0000 |
CVSROOT: /cvsroot/libcvd
Module name: libcvd
Changes by: Ethan Eade <ethaneade> 08/02/22 15:24:34
Modified files:
cvd : convolution.h
cvd_src : convolution.cc
cvd/internal : aligned_mem.h
Log message:
Added SSE optimized gaussian convolution for float -> float. Also
added AlignedMem
scope guard to automatically release aligned memory.
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/libcvd/cvd/convolution.h?cvsroot=libcvd&r1=1.9&r2=1.10
http://cvs.savannah.gnu.org/viewcvs/libcvd/cvd_src/convolution.cc?cvsroot=libcvd&r1=1.1&r2=1.2
http://cvs.savannah.gnu.org/viewcvs/libcvd/cvd/internal/aligned_mem.h?cvsroot=libcvd&r1=1.10&r2=1.11
Patches:
Index: cvd/convolution.h
===================================================================
RCS file: /cvsroot/libcvd/libcvd/cvd/convolution.h,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -b -r1.9 -r1.10
--- cvd/convolution.h 4 Aug 2007 00:08:51 -0000 1.9
+++ cvd/convolution.h 22 Feb 2008 15:24:33 -0000 1.10
@@ -383,9 +383,9 @@
template <class S> static inline T at(const T* input, const S& factor, const
S* ) { return *input * factor; }
};
-#if 1
template <class T,class S> inline const T* convolveMiddle(const T* input,
const S& factor, const S* kernel, int ksize, int n, T* output) {
-#define CALL_CM(I) for (int j=0; j<n; j++, input++) { *(output++) =
ConvolveMiddle<T,I>::at(input, factor, kernel); } break
+#define CALL_CM(I) for (int j=0; j<n; ++j, ++input, ++output) { *output =
ConvolveMiddle<T,I>::at(input, factor, kernel); } break
+
switch (ksize) {
case 0: CALL_CM(0);
case 1: CALL_CM(1);
@@ -406,14 +406,43 @@
#undef CALL_CM
}
-#else
-template <class T,class S> const T* convolveMiddle(const T* input, const S&
factor, const S* kernel, int ksize, int n, T* output) {
- assign_multiple(input, factor, output, n);
- for (int r=0; r <ksize; r++) {
- add_multiple_of_sum(input-r-1, input+r+1, kernel[r], output, n);
+#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_XMMINTRIN)
+
+#include <xmmintrin.h>
+
+template <class S> const float* convolveMiddle(const float* input, const S&
factor, const S* kernel, int ksize, int n, float* output) {
+ __m128 kkkk[ksize+1] __attribute__ ((aligned(16)));
+ kkkk[0] = _mm_set1_ps(factor);
+ for (int i=1; i<=ksize; i++)
+ kkkk[i] = _mm_set1_ps(kernel[i-1]);
+
+ int i=0;
+ for (; i<n && !is_aligned<16>(input); i++, ++input, ++output) {
+ *output = ConvolveMiddle<float,-1>::at(input, factor, kernel, ksize);
+ }
+
+ for (; i<n-3; i+=4) {
+ __m128 sum = _mm_mul_ps(kkkk[0], _mm_load_ps(input));
+ const float* back = input - ksize-1;
+ const float* front = input + ksize+1;
+ const __m128* kp = kkkk+ksize+1;
+ while (++back != --front) {
+ --kp;
+ const __m128& kr = *kp;
+ const __m128 b = _mm_loadu_ps(back);
+ const __m128 f = _mm_loadu_ps(front);
+ sum = _mm_add_ps(sum, _mm_mul_ps(kr, _mm_add_ps(b,f)));
+ }
+ _mm_stream_ps(output, sum);
+ output += 4;
+ input += 4;
}
- return input + n;
+
+ for (; i<n; i++, ++input, ++output) {
+ *output = ConvolveMiddle<float,-1>::at(input, factor, kernel, ksize);
+ }
+ return input;
}
#endif
@@ -429,7 +458,8 @@
typedef typename Pixel::traits<T>::float_type sum_type;
assert(out.size() == I.size());
int ksize = (int)(sigmas*sigma + 0.5);
- sum_comp_type *kernel = new sum_comp_type[ksize];
+ //std::cerr << "sigma: " << sigma << " kernel: " << ksize << std::endl;
+ std::vector<sum_comp_type> kernel(ksize);
sum_comp_type ksum = sum_comp_type();
for (int i=1; i<=ksize; i++)
ksum += (kernel[i-1] =
static_cast<sum_comp_type>(exp(-i*i/(2*sigma*sigma))));
@@ -440,13 +470,16 @@
int h = I.size().y;
int swin = 2*ksize;
- sum_type* buffer = Internal::aligned_mem<sum_type,16>::alloc(w*(swin+1));
- sum_type* rowbuf = Internal::aligned_mem<sum_type,16>::alloc(w);
- sum_type* outbuf = Internal::aligned_mem<sum_type,16>::alloc(w);
+ AlignedMem<sum_type,16> buffer(w*(swin+1));
+ AlignedMem<sum_type,16> aligned_rowbuf(w);
+ AlignedMem<sum_type,16> aligned_outbuf(w);
+
+ sum_type* rowbuf = aligned_rowbuf.data();
+ sum_type* outbuf = aligned_outbuf.data();
- sum_type** rows = new sum_type*[swin+1];
+ std::vector<sum_type*> rows(swin+1);
for (int k=0;k<swin+1;k++)
- rows[k] = buffer + k*w;
+ rows[k] = buffer.data() + k*w;
T* output = out.data();
for (int i=0; i<h; i++) {
@@ -461,7 +494,7 @@
}
// middle of row
input += ksize;
- input = convolveMiddle<sum_type, sum_comp_type>(input, factor, kernel,
ksize, w-swin, next_row+ksize);
+ input = convolveMiddle<sum_type, sum_comp_type>(input, factor,
&kernel.front(), ksize, w-swin, next_row+ksize);
// end of row
for (int j=w-ksize; j<w; j++, input++) {
sum_type hsum = *input * factor;
@@ -518,13 +551,13 @@
rows[swin] = tmp;
}
- Internal::aligned_mem<sum_type,16>::release(buffer);
- Internal::aligned_mem<sum_type,16>::release(rowbuf);
- Internal::aligned_mem<sum_type,16>::release(outbuf);
delete[] kernel;
- delete[] rows;
}
+#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_XMMINTRIN)
+void convolveGaussian(const BasicImage<float>& I, BasicImage<float>& out,
double sigma, double sigmas=3.0);
+#endif
+
template <class T, class O, class K> void convolve_gaussian_3(const
BasicImage<T>& I, BasicImage<O>& out, K k1, K k2)
{
assert(I.size() == out.size());
Index: cvd_src/convolution.cc
===================================================================
RCS file: /cvsroot/libcvd/libcvd/cvd_src/convolution.cc,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- cvd_src/convolution.cc 26 Oct 2005 20:26:12 -0000 1.1
+++ cvd_src/convolution.cc 22 Feb 2008 15:24:34 -0000 1.2
@@ -104,4 +104,186 @@
}
}
+#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_XMMINTRIN)
+inline void convolveMiddle5(const float* in, double factor, const double
kernel[], int count, float* out)
+{
+ int i;
+ const __m128 ffff = _mm_set1_ps(factor);
+ const __m128 k1 = _mm_set1_ps(kernel[0]);
+ const __m128 k2 = _mm_set1_ps(kernel[1]);
+ for (i=0; i+3<count; i+=4, in += 4, out += 4) {
+ __m128 sum = (_mm_mul_ps(_mm_loadu_ps(in), ffff) +
+ _mm_mul_ps(k1, (_mm_loadu_ps(in-1) + _mm_loadu_ps(in+1)))
+
+ _mm_mul_ps(k2, (_mm_loadu_ps(in-2) +
_mm_loadu_ps(in+2))));
+ _mm_storeu_ps(out, sum);
+ }
+
+ for (; i<count; ++i, ++in, ++out) {
+ double sum = in[0] * factor + kernel[0] * (in[-1] + in[1]) + kernel[1]
* (in[-2] * in[2]);
+ *out = sum;
+ }
+}
+
+inline void convolveMiddle(const float* in, double factor, const
vector<double>& kernel, int count, float* out)
+{
+ const int ksize = kernel.size();
+ if (ksize == 2) {
+ convolveMiddle5(in, factor, &kernel[0], count, out);
+ return;
+ }
+ int i;
+ const __m128 ffff = _mm_set1_ps(factor);
+ for (i=0; i+3<count; i+=4, in += 4, out += 4) {
+ __m128 sum = _mm_mul_ps(_mm_loadu_ps(in), ffff);
+ for (int k=1; k<=ksize; ++k)
+ sum += _mm_set1_ps(kernel[k-1]) * (_mm_loadu_ps(in-k) +
_mm_loadu_ps(in+k));
+ _mm_storeu_ps(out, sum);
+ }
+
+ for (; i<count; ++i, ++in, ++out) {
+ double sum = in[0] * factor;
+ for (int k=1; k<=ksize; ++k)
+ sum += kernel[k-1] * (in[-k] + in[k]);
+ *out = sum;
+ }
+}
+
+inline void convolveVertical5(const vector<float*> row, double factor, double
kernel[], int count, float* out)
+{
+ const int ksize = 2;
+ int i;
+ for (i=0; i<count && !is_aligned<16>(row[0] + i); ++i, ++out) {
+ double sum = row[ksize][i] * factor + (row[1][i] + row[3][i]) *
kernel[0] + (row[0][i] + row[4][i]) * kernel[1];
+ *out = sum;
+ }
+
+ const __m128 ffff = _mm_set1_ps(factor);
+ const __m128 k1 = _mm_set1_ps(kernel[0]);
+ const __m128 k2 = _mm_set1_ps(kernel[1]);
+
+ for (; i+3<count; i+=4) {
+ __m128 sum = (_mm_load_ps(row[ksize] + i) * ffff
+ + (_mm_load_ps(row[ksize-1]+i) +
_mm_load_ps(row[ksize+1]+i)) * k1
+ + (_mm_load_ps(row[ksize-2]+i) +
_mm_load_ps(row[ksize+2]+i)) * k2);
+ _mm_store_ps(out + i, sum);
+ }
+ for (; i<count; ++i, ++out) {
+ double sum = row[ksize][i] * factor + (row[1][i] + row[3][i]) *
kernel[0] + (row[0][i] + row[4][i]) * kernel[1];
+ *out = sum;
+ }
+}
+
+inline void convolveVertical(const vector<float*> row, double factor,
vector<double>& kernel, int count, float* out)
+{
+ const int ksize = kernel.size();
+ if (ksize == 2) {
+ convolveVertical5(row, factor, &kernel[0], count, out);
+ return;
+ }
+
+ int i;
+ for (i=0; i<count && !is_aligned<16>(row[0] + i); ++i, ++out) {
+ double sum = row[ksize][i] * factor;
+ for (int k=1; k<=ksize; ++k)
+ sum += kernel[k-1] * (row[ksize-k][i] + row[ksize+k][i]);
+ *out = sum;
+ }
+ const __m128 ffff = _mm_set1_ps(factor);
+ for (; i+3<count; i+=4) {
+ __m128 sum = _mm_mul_ps(_mm_load_ps(row[ksize] + i), ffff);
+ for (int k=1; k<=ksize; ++k)
+ sum += _mm_set1_ps(kernel[k-1]) * (_mm_load_ps(row[ksize-k]+i) +
_mm_load_ps(row[ksize+k]+i));
+ _mm_store_ps(out + i, sum);
+ }
+ for (; i<count; ++i, ++out) {
+ double sum = row[ksize][i] * factor;
+ for (int k=1; k<=ksize; ++k)
+ sum += kernel[k-1] * (row[ksize-k][i] + row[ksize+k][i]);
+ *out = sum;
+ }
+}
+
+void convolveGaussian(const BasicImage<float>& I, BasicImage<float>& out,
double sigma, double sigmas)
+{
+ assert(out.size() == I.size());
+ int ksize = (int)(sigmas*sigma + 0.5);
+ vector<double> kernel(ksize);
+ double ksum = 1.0;
+ for (int i=1; i<=ksize; i++)
+ ksum += 2*(kernel[i-1] = exp(-i*i/(2*sigma*sigma)));
+ double factor = 1.0/ksum;
+ for (int i=0; i<ksize; i++)
+ kernel[i] *= factor;
+ const int w = I.size().x;
+ const int h = I.size().y;
+ const int swin = 2*ksize;
+
+ AlignedMem<float,16> buffer(w*(swin+1));
+ vector<float*> rows(swin+1);
+
+ for (int k=0;k<swin+1;k++)
+ rows[k] = buffer.data() + k*w;
+
+ float* output = out.data();
+ for (int i=0; i<h; i++) {
+ float* next_row = rows[swin];
+ const float* input = I[i];
+ // beginning of row
+ for (int j=0; j<ksize; j++) {
+ double hsum = input[j] * factor;
+ for (int k=0; k<ksize; k++)
+ hsum += (input[abs(j-k-1)] + input[j+k+1]) * kernel[k];
+ next_row[j] = hsum;
+ }
+ // middle of row
+ input += ksize;
+ convolveMiddle(input, factor, kernel, w-swin, next_row+ksize);
+ input += w-swin;
+ // end of row
+ for (int j=w-ksize; j<w; j++, input++) {
+ double hsum = *input * factor;
+ const int room = w-j;
+ for (int k=0; k<ksize; k++) {
+ hsum += (input[-k-1] + (k+1 >= room ? input[2*room-k-2] :
input[k+1])) * kernel[k];
+ }
+ next_row[j] = hsum;
+ }
+ // vertical
+ if (i >= swin) {
+ convolveVertical(rows, factor, kernel, w, output);
+ output += w;
+ if (i == h-1) {
+ for (int r=0; r<ksize; r++, output += w) {
+ vector<float*> rrows(rows.size());
+ rrows[ksize] = rows[ksize+r+1];
+ for (int k=0; k<ksize; ++k) {
+ rrows[ksize-k-1] = rows[ksize+r-k];
+ int hi = ksize+r+k+2;
+ rrows[ksize+k+1] = rows[hi > swin ? 2*swin - hi : hi];
+ }
+ convolveVertical(rrows, factor, kernel, w, output);
+ }
+ }
+ } else if (i == swin-1) {
+ for (int r=0; r<ksize; r++, output += w) {
+ vector<float*> rrows(rows.size());
+ rrows[ksize] = rows[r+1];
+ for (int k=0; k<ksize; ++k) {
+ rrows[ksize-k-1] = rows[abs(r-k-1)+1];
+ rrows[ksize+k+1] = rows[r+k+2];
+ }
+ convolveVertical(rrows, factor, kernel, w, output);
+ }
+ }
+
+ float* tmp = rows[0];
+ for (int r=0;r<swin; r++)
+ rows[r] = rows[r+1];
+ rows[swin] = tmp;
+ }
+}
+#endif
+
+
};
+
Index: cvd/internal/aligned_mem.h
===================================================================
RCS file: /cvsroot/libcvd/libcvd/cvd/internal/aligned_mem.h,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -b -r1.10 -r1.11
--- cvd/internal/aligned_mem.h 15 Jan 2008 15:28:07 -0000 1.10
+++ cvd/internal/aligned_mem.h 22 Feb 2008 15:24:34 -0000 1.11
@@ -96,6 +96,20 @@
template <class T, int N> std::map<T*,typename aligned_mem<T,N>::entry>
aligned_mem<T,N>::buffers;
}
+
+ template <class T, int N> struct AlignedMem {
+ T* mem;
+ AlignedMem(size_t count) {
+ mem = Internal::aligned_mem<T,N>::alloc(count);
+ }
+ ~AlignedMem() {
+ Internal::aligned_mem<T,N>::release(mem);
+ }
+ T* data() { return mem; }
+ const T* data() const { return mem; }
+ };
+
+
}
#endif
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [libcvd-members] libcvd cvd/convolution.h cvd_src/convolution.cc...,
Ethan Eade <=