libcvd-members
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[libcvd-members] libcvd cvd/utility.h cvd_src/utility.cc


From: Ethan Eade
Subject: [libcvd-members] libcvd cvd/utility.h cvd_src/utility.cc
Date: Wed, 25 Oct 2006 00:47:45 +0000

CVSROOT:        /cvsroot/libcvd
Module name:    libcvd
Changes by:     Ethan Eade <ethaneade>  06/10/25 00:47:45

Modified files:
        cvd            : utility.h 
        cvd_src        : utility.cc 

Log message:
        Added new operations square and subtract_square with SIMD
        specializations for floats.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/libcvd/cvd/utility.h?cvsroot=libcvd&r1=1.4&r2=1.5
http://cvs.savannah.gnu.org/viewcvs/libcvd/cvd_src/utility.cc?cvsroot=libcvd&r1=1.6&r2=1.7

Patches:
Index: cvd/utility.h
===================================================================
RCS file: /cvsroot/libcvd/libcvd/cvd/utility.h,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- cvd/utility.h       16 May 2006 13:14:37 -0000      1.4
+++ cvd/utility.h       25 Oct 2006 00:47:45 -0000      1.5
@@ -24,9 +24,7 @@
 in the output image
   @ingroup gImageIO
   */
-  template<class S, class T> void copy(const BasicImage<S>& in, BasicImage<T>&
-out, ImageRef size=ImageRef(-1,-1), ImageRef begin = ImageRef(), ImageRef dst =
-ImageRef())
+  template<class S, class T> void copy(const BasicImage<S>& in, BasicImage<T>& 
out, ImageRef size=ImageRef(-1,-1), ImageRef begin = ImageRef(), ImageRef dst = 
ImageRef())
   {
     if (size.x == -1 && size.y == -1)
       size = in.size();
@@ -119,10 +117,10 @@
   /// Compute pointwise a_i * c and store in out_i
   /// This is accelerated using SIMD for some platforms and data types 
(alignment is checked at runtime)
   /// Do not specify template parameters explicitly so that overloading can 
choose the right implementation
-  template <class A, class B> inline void assign_multiple(const A* a, const A& 
c,  B* out, unsigned int count)
+      template <class A, class B, class C> inline void assign_multiple(const 
A* a, const B& c,  C* out, unsigned int count)
   {
       while (count--)
-         *(out++) = *(a++) * c;
+         *(out++) = static_cast<C>(*(a++) * c);
   }
 
   /// Compute sum(a_i*b_i)
@@ -146,6 +144,22 @@
       }
   };
  
+  template <class T1, class T2> inline void square(const T1* in, T2* out, 
size_t count) 
+  {
+      while (count--) {
+         *(out++) = static_cast<T2>(*in * *in);
+         ++in;
+      }
+  }
+
+  template <class T1, class T2> inline void subtract_square(const T1* in, T2* 
out, size_t count) 
+  {
+      while (count--) {
+         *(out++) -= static_cast<T2>(*in * *in);
+         ++in;
+      }
+  }
+
   /// Compute sum of (a_i - b_i)^2 (the SSD)
   /// This is accelerated using SIMD for some platforms and data types 
(alignment is checked at runtime)
   /// Do not specify template parameters explicitly so that overloading can 
choose the right implementation
@@ -176,6 +190,8 @@
   void assign_multiple(const float* a, const float& c,  float* out, unsigned 
int count);
   double inner_product(const float* a, const float* b, unsigned int count);
   double sum_squared_differences(const float* a, const float* b, size_t count);
+  void square(const float* in, float* out, size_t count);
+  void subtract_square(const float* in, float* out, size_t count);
 #endif
 
 #if defined (CVD_HAVE_SSE2) && defined(CVD_HAVE_EMMINTRIN)

Index: cvd_src/utility.cc
===================================================================
RCS file: /cvsroot/libcvd/libcvd/cvd_src/utility.cc,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -b -r1.6 -r1.7
--- cvd_src/utility.cc  29 May 2006 12:42:26 -0000      1.6
+++ cvd_src/utility.cc  25 Oct 2006 00:47:45 -0000      1.7
@@ -142,6 +142,47 @@
        return sum;
     }    
 
+template <class F, class T1, class T2, int A, int M> inline void 
maybe_aligned_square(const T1* in, T2* out, size_t count)
+{
+    if (count < M*2) {
+       return F::unaligned_square(in,out,count);
+    }
+    if (!is_aligned<A>(in)) {
+       unsigned int steps = steps_to_align<A>(in);
+       F::unaligned_square(in,out,steps);
+       count -= steps;
+       in += steps;
+       out += steps;
+       if (count < M) {
+           F::unaligned_square(in,out,count);
+       }
+    }
+    size_t block = (count/M)*M;
+    F::aligned_square(in,out,block);
+    if (count > block)
+       F::unaligned_square(in+block,out+block,count-block);
+}    
+
+template <class F, class T1, class T2, int A, int M> inline void 
maybe_aligned_subtract_square(const T1* in, T2* out, size_t count)
+{
+    if (count < M*2) {
+       return F::unaligned_subtract_square(in,out,count);
+    }
+    if (!is_aligned<A>(in)) {
+       unsigned int steps = steps_to_align<A>(in);
+       F::unaligned_subtract_square(in,out,steps);
+       count -= steps;
+       in += steps;
+       out += steps;
+       if (count < M) {
+           F::unaligned_subtract_square(in,out,count);
+       }
+    }
+    size_t block = (count/M)*M;
+    F::aligned_subtract_square(in,out,block);
+    if (count > block)
+       F::unaligned_subtract_square(in+block,out+block,count-block);
+}    
 
 
 namespace CVD {
@@ -211,10 +252,13 @@
     template <bool Aligned> inline void store_ps(__m128 m, void* addr) { 
return _mm_storeu_ps((float*)addr, m); }
     template <> inline void store_ps<true>(__m128 m, void* addr) { return 
_mm_store_ps((float*)addr, m); }
 
-    template <bool Aligned_b> inline void float_differences(const __m128* a, 
const __m128* b, __m128* diff, unsigned int count)
+    template <bool Aligned_b> void float_differences(const __m128* a, const 
__m128* b, __m128* diff, unsigned int count)
     {
        while (count--) {
-           *(diff++) = _mm_sub_ps(*(a++), load_ps<Aligned_b>(b++));
+           _mm_stream_ps((float*)diff, _mm_sub_ps(load_ps<true>(a), 
load_ps<Aligned_b>(b)));
+           ++diff;
+           ++a;
+           ++b;
        }
     }
     
@@ -222,8 +266,10 @@
     {
        __m128 cccc = _mm_set1_ps(c);
        while (count--) {
-           *out = _mm_add_ps(_mm_mul_ps(_mm_add_ps(*(a++), 
load_ps<Aligned_b>(b++)), cccc), *out);
+           *out = _mm_add_ps(_mm_mul_ps(_mm_add_ps(load_ps<true>(a), 
load_ps<Aligned_b>(b)), cccc), *out);
            ++out;
+           ++a;
+           ++b;
        }
     }
 
@@ -273,6 +319,25 @@
        return ssd;
     }
 
+    template <bool Aligned_out> void float_square(const __m128* in, __m128* 
out, size_t count) {
+       while (count--) {
+           __m128 x = load_ps<true>(in);
+           store_ps<Aligned_out>(_mm_mul_ps(x, x), out);
+           ++in;
+           ++out;
+       }
+    }
+
+    template <bool Aligned_out> void float_subtract_square(const __m128* in, 
__m128* out, size_t count) {
+       while (count--) {
+           __m128 x = load_ps<true>(in);
+           __m128 y = load_ps<Aligned_out>(out);
+           store_ps<Aligned_out>(_mm_sub_ps(y, _mm_mul_ps(x, x)), out);
+           ++in;
+           ++out;
+       }
+    }
+
     struct SSE_funcs {
        template <class T1, class T2> static inline void 
unaligned_differences(const T1* a, const T1* b, T2* diff, size_t count) {
            differences<T1,T2>(a,b,diff,count);
@@ -328,6 +393,27 @@
            else
                return float_sum_squared_differences<false>((const __m128*) a, 
(const __m128*) b, count>>2);
        }       
+       
+       template <class T1, class T2> static inline void unaligned_square(const 
T1* in, T2* out, size_t count) {
+           square<T1,T2>(in, out, count);
+       }
+
+       static inline void aligned_square(const float* in, float* out, size_t 
count) {
+           if (is_aligned<16>(out))
+               float_square<true>((const __m128*)in, (__m128*)out, count >> 2);
+           else
+               float_square<false>((const __m128*)in, (__m128*)out, count >> 
2);               
+       }
+       template <class T1, class T2> static inline void 
unaligned_subtract_square(const T1* in, T2* out, size_t count) {
+           subtract_square<T1,T2>(in, out, count);
+       }
+
+       static inline void aligned_subtract_square(const float* in, float* out, 
size_t count) {
+           if (is_aligned<16>(out))
+               float_subtract_square<true>((const __m128*)in, (__m128*)out, 
count >> 2);
+           else
+               float_subtract_square<false>((const __m128*)in, (__m128*)out, 
count >> 2);              
+       }
     };
     
     void differences(const float* a, const float* b, float* diff, unsigned int 
size)
@@ -356,6 +442,16 @@
        return maybe_aligned_ssd<SSE_funcs,double,float,16,4>(a,b,count);
     }
 
+    void square(const float* in, float* out, size_t count) 
+    {
+       maybe_aligned_square<SSE_funcs,float,float,16,4>(in, out, count);
+    }
+
+    void subtract_square(const float* in, float* out, size_t count) 
+    {
+       maybe_aligned_subtract_square<SSE_funcs,float,float,16,4>(in, out, 
count);
+    }
+
 #endif
 
 #if defined (CVD_HAVE_SSE2) && defined(CVD_HAVE_EMMINTRIN)




reply via email to

[Prev in Thread] Current Thread [Next in Thread]