From: Ethan Eade
Subject: [Libcvd-members] libcvd cvd/convolution.h cvd/utility.h cvd_src/... [videofilebuffer_test_23052006]
Date: Wed, 24 May 2006 00:38:32 +0000

CVSROOT:        /cvsroot/libcvd
Module name:    libcvd
Branch:         videofilebuffer_test_23052006
Changes by:     Ethan Eade <address@hidden>     06/05/24 00:38:31

Modified files:
        cvd            : convolution.h utility.h 
        cvd_src        : utility.cc 

Log message:
        Fixed recent changes to convolution.h to avoid explicitly specifying
        function template parameters, so that proper overload resolution is
        used by the compiler.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/libcvd/libcvd/cvd/convolution.h.diff?only_with_tag=videofilebuffer_test_23052006&tr1=1.5&tr2=1.5.2.1&r1=text&r2=text
http://cvs.savannah.gnu.org/viewcvs/libcvd/libcvd/cvd/utility.h.diff?only_with_tag=videofilebuffer_test_23052006&tr1=1.4&tr2=1.4.2.1&r1=text&r2=text
http://cvs.savannah.gnu.org/viewcvs/libcvd/libcvd/cvd_src/utility.cc.diff?only_with_tag=videofilebuffer_test_23052006&tr1=1.5&tr2=1.5.2.1&r1=text&r2=text

Patches:
Index: libcvd/cvd/convolution.h
diff -u /dev/null libcvd/cvd/convolution.h:1.5.2.1
--- /dev/null   Wed May 24 00:38:31 2006
+++ libcvd/cvd/convolution.h    Wed May 24 00:38:31 2006
@@ -0,0 +1,545 @@
+/*
+        This file is part of the CVD Library.
+
+        Copyright (C) 2005 The Authors
+
+        This library is free software; you can redistribute it and/or
+        modify it under the terms of the GNU Lesser General Public
+        License as published by the Free Software Foundation; either
+        version 2.1 of the License, or (at your option) any later version.
+
+        This library is distributed in the hope that it will be useful,
+        but WITHOUT ANY WARRANTY; without even the implied warranty of
+        MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+        Lesser General Public License for more details.
+
+        You should have received a copy of the GNU Lesser General Public
+        License along with this library; if not, write to the Free Software
+        Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#ifndef CVD_CONVOLUTION_H_
+#define CVD_CONVOLUTION_H_
+
+#include <vector>
+#include <memory>
+#include <numeric>
+#include <algorithm>
+
+#include <cvd/config.h>
+#include <cvd/exceptions.h>
+#include <cvd/image.h>
+#include <cvd/internal/pixel_operations.h>
+#include <cvd/internal/aligned_mem.h>
+#include <cvd/utility.h>
+
+namespace CVD {
+
+/// creates a Gaussian kernel with given maximum value and standard deviation.
+/// All elements of the passed vector are filled, so the size of the vector
+/// determines the size of the computed kernel. The normalizing value is returned.
+/// @param k vector of T's holds the kernel values
+/// @param maxval the maximum value to be used
+/// @param stddev standard deviation of the kernel
+/// @return the sum of the kernel elements for normalization
+/// @ingroup gVision
+template <class T>
+T gaussianKernel(std::vector<T>& k, T maxval, double stddev)
+{
+    double sum = 0;
+    unsigned int i, argmax=0;
+    std::vector<double> kernel(k.size());
+    for (i=0;i<k.size();i++) {
+        double x = i +0.5 - k.size()/2.0;
+        sum += kernel[i] = exp(-x*x/(2*stddev*stddev));
+        if (kernel[i] > kernel[argmax])
+        argmax = i;
+    }
+    T finalSum = 0;
+    for (i=0;i<k.size();i++)
+    finalSum += k[i] = (T)(kernel[i]*maxval/kernel[argmax]);
+    return finalSum;
+}
+
+/// scales a Gaussian kernel to a different maximum value. The new kernel is
+/// returned in scaled. The new normalizing value is returned.
+/// @param k input kernel
+/// @param scaled output vector to hold the resulting kernel
+/// @param maxval the new maximum value
+/// @return sum of the new kernel elements for normalization
+/// @ingroup gVision
+template <class S, class T>
+T scaleKernel(const std::vector<S>& k, std::vector<T>& scaled, T maxval)
+{
+    unsigned int i,argmax=0;
+    for (i=1;i<k.size();i++)
+        if (k[i]>k[argmax])
+            argmax = i;
+    scaled.resize(k.size());
+    T sum = 0;
+    for (i=0;i<k.size();i++)
+        sum += (scaled[i] = (T)((k[i]*maxval)/k[argmax]));
+    return sum;
+}
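
(Illustrative usage only, not part of the patch; kernel_example and its locals are hypothetical. It shows how the two kernel helpers above are typically combined.)

    #include <cvd/convolution.h>
    #include <vector>

    void kernel_example()
    {
        std::vector<double> k(5);                        // the vector's size fixes the kernel size
        double norm  = CVD::gaussianKernel(k, 1.0, 1.0); // peak value 1.0, stddev 1.0
        std::vector<int> ki;                             // rescale to integer taps with peak 256
        int    inorm = CVD::scaleKernel(k, ki, 256);
        (void)norm; (void)inorm;
    }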
+
+template <class T>
+void convolveGaussian5_1(BasicImage<T>& I)
+{
+    int w = I.size().x;
+    int h = I.size().y;
+    int i,j;
+    for (j=0;j<w;j++) {
+        T* src = I.data()+j;
+        T* end = src + w*(h-4);
+        while (src != end) {
+            T sum= (T)(0.0544887*(src[0]+src[4*w])
+                    + 0.2442010*(src[w]+src[3*w])
+                    + 0.4026200*src[2*w]);
+            *(src) = sum;
+            src += w;
+        }
+    }
+    for (i=h-5;i>=0;i--) {
+        T* src = I.data()+i*w;
+        T* end = src + w-4;
+        while (src != end) {
+            T sum= (T)(0.0544887*(src[0]+src[4])
+                    + 0.2442010*(src[1]+src[3])
+                    + 0.4026200*src[2]);
+            *(src+2*w+2)=sum;
+            ++src;
+        }
+    }
+}
+
+namespace Exceptions {
+
+    /// %Exceptions specific to vision algorithms
+    /// @ingroup gException
+    namespace Convolution {
+        /// Base class for all convolution exceptions
+        /// @ingroup gException
+        struct All: public CVD::Exceptions::All {};
+
+        /// Input images have incompatible dimensions
+        /// @ingroup gException
+        struct IncompatibleImageSizes : public All {
+            IncompatibleImageSizes(const std::string & function)
+            {
+                what = "Incompatible image sizes in " + function;
+            };
+        };
+    }
+}
+//void convolveGaussian5_1(BasicImage<byte>& I);
+
+/// convolves an image with a box of given size.
+/// @param I input image
+/// @param J output image (the in-place overloads below pass the same image for both)
+/// @param hwin window half-size; the full box is 2*hwin+1 pixels across
+/// @ingroup gVision
+template <class T> void convolveWithBox(const BasicImage<T>& I, BasicImage<T>& J, ImageRef hwin)
+{
+    typedef typename Pixel::traits<T>::wider_type sum_type;
+    if (I.size() != J.size()) {
+       throw Exceptions::Convolution::IncompatibleImageSizes("convolveWithBox");
+    }
+    int w = I.size().x;
+    int h = I.size().y;
+    ImageRef win = 2*hwin+ImageRef(1,1);
+    const double factor = 1.0/(win.x*win.y);
+    std::vector<sum_type> buffer(w*win.y);
+    std::vector<sum_type> sums_v(w);
+    sum_type* sums = &sums_v[0];
+    sum_type* next_row = &buffer[0];
+    sum_type* oldest_row = &buffer[0];
+    zeroPixels(sums, w);
+    const T* input = I.data();
+    T* output = J[hwin.y] - hwin.x;
+    for (int i=0; i<h; i++) {
+       sum_type hsum=sum_type();
+       const T* back = input;
+       int j;
+       for (j=0; j<win.x-1; j++)
+           hsum += input[j];
+       for (; j<w; j++) {
+           hsum += input[j];
+           next_row[j] = hsum;
+           sums[j] += hsum;
+           hsum -= *(back++);
+       }
+       if (i >= win.y-1) {
+           assign_multiple(sums+win.x-1, factor, output+win.x-1, w-win.x+1);
+           differences(sums+win.x-1, oldest_row+win.x-1, sums+win.x-1, w-win.x+1);
+           output += w;
+           oldest_row += w;
+           if (oldest_row == &buffer[0] + w*win.y)
+               oldest_row = &buffer[0];
+       }    
+       input += w;
+       next_row += w;
+       if (next_row == &buffer[0] + w*win.y)
+           next_row = &buffer[0];
+    }
+}
+
+template <class T> inline void convolveWithBox(const BasicImage<T>& I, BasicImage<T>& J, int hwin)
+{
+    convolveWithBox(I, J, ImageRef(hwin,hwin));
+}
+
+template <class T> inline void convolveWithBox(BasicImage<T>& I, int hwin) {
+    convolveWithBox(I,I,hwin);
+}
+
+template <class T> inline void convolveWithBox(BasicImage<T>& I, ImageRef hwin) {
+    convolveWithBox(I,I,hwin);
+}
+    
+
+template <class T, int A, int B, int C> void convolveSymmetric(Image<T>& I)
+  {
+    typedef typename Pixel::traits<T>::wider_type wider;
+    static const wider S = (A+B+C+B+A);
+    int width = I.size().x;
+    int height = I.size().y;
+    T* p = I.data();
+    int i,j;
+    for (i=0; i<height; i++) {
+      wider a = p[0];
+      wider b = p[1];
+      wider c = p[2];
+      wider d = p[3];
+      p[0] = (T)(((c+c)*A+(b+b)*B + a*C) /S);
+      p[1] = (T)(((b+d)*A+(a+c)*B + b*C) /S);
+      for (j=0;j<width-4;j++,p++) {
+        wider e = p[4];
+        p[2] = (T)(((a+e)*A + (b+d)*B + c*C)/S);
+        a = b; b = c; c = d; d = e;
+      }
+      p[2] = (T)(((a+c)*A + (b+d)*B + c*C) /S);
+      p[3] = (T)(((b+b)*A + (c+c)*B + d*C) /S);
+      p += 4;
+    }
+    for (j=0;j<width;j++) {
+      p = I.data()+j;
+      wider a = p[0];
+      wider b = p[width];
+      p[0] = (T)(((p[2*width]+p[2*width])*A+(b+b)*B + a*C) /S);
+      p[width] = (T)(((b+p[width*3])*A+(a+p[2*width])*B + b*C) /S);
+      for (i=0;i<height-4;i++) {
+        wider c = p[2*width];
+        p[2*width] = (T)(((a+p[4*width])*A + (b+p[3*width])*B + c*C)/S);
+        a=b; b=c;
+        p += width;
+      }
+      wider c = p[2*width];
+      p[2*width] = (T)(((a+c)*A + (b+p[width*3])*B + c*C) /S);
+      p[3*width] = (T)(((b+b)*A + (c+c)*B + p[width*3]*C) /S);
+    }
+  }
+
+  template <class T, int A, int B, int C, int D> void convolveSymmetric(Image<T>& I)
+  {
+    typedef typename Pixel::traits<T>::wider_type wider;
+    static const wider S = (A+B+C+D+C+B+A);
+    int width = I.size().x;
+    int height = I.size().y;
+    T* p = I.data();
+    int i,j;
+    for (i=0; i<height; i++) {
+      wider a = p[0];
+      wider b = p[1];
+      wider c = p[2];
+      wider d = p[3];
+      p[0] = (T)(((d+d)*A + (c+c)*B + (b+b)*C + a*D)/S);
+      p[1] = (T)(((c+p[4])*A + (b+d)*B + (a+c)*C + b*D)/S);
+      p[2] = (T)(((b+p[5])*A + (a+p[4])*B + (b+d)*C + c*D)/S);
+      for (j=0;j<width-6;j++,p++) {
+        d = p[3];
+        p[3] = (T)(((a+p[6])*A + (b+p[5])*B + (c+p[4])*C + d*D)/S);
+        a=b; b=c; c=d;
+      }
+      d = p[3];
+      wider e = p[4];
+      p[3] = (T)(((a+e)*A + (b+p[5])*B + (c+e)*C + d*D)/S);
+      p[4] = (T)(((b+d)*A + (c+e)*B + (d+p[5])*C + e*D)/S);
+      p[5] = (T)(((c+c)*A + (d+d)*B + (e+e)*C + p[5]*D)/S);
+      p += 6;
+    }
+    for (j=0;j<width;j++) {
+      p = I.data()+j;
+      wider a = p[0];
+      wider b = p[width];
+      wider c = p[2*width];
+      wider d = p[3*width];
+      p[0] = (T)(((d+d)*A + (c+c)*B + (b+b)*C + a*D)/S);
+      p[width] = (T)(((c+p[4*width])*A + (b+d)*B + (a+c)*C + b*D)/S);
+      p[2*width] = (T)(((b+p[5*width])*A + (a+p[4*width])*B + (b+d)*C + c*D)/S);
+      for (i=0;i<height-6;i++) {
+        d = p[3*width];
+        p[3*width] = (T)(((a+p[width*6])*A + (b+p[width*5])*B + (c+p[width*4])*C+d*D)/S);
+        a=b; b=c; c=d;
+        p += width;
+      }
+      d = p[3*width];
+      wider e = p[4*width];
+      p[3*width] = (T)(((a+e)*A + (b+p[5*width])*B + (c+e)*C + d*D)/S);
+      p[4*width] = (T)(((b+d)*A + (c+e)*B + (d+p[5*width])*C + e*D)/S);
+      p[5*width] = (T)(((c+c)*A + (d+d)*B + (e+e)*C + p[5*width]*D)/S);
+    }
+  }
+
+template <class T, class K> void convolveSeparableSymmetric(Image<T>& I, const std::vector<K>& kernel, K divisor)
+  {
+    typedef typename Pixel::traits<T>::wider_type sum_type;
+    int w = I.size().x;
+    int h = I.size().y;
+    int r = (int)kernel.size()/2;
+    int i,j;
+    int m;
+    double factor = 1.0/divisor;
+    for (j=0;j<w;j++) {
+      T* src = I.data()+j;
+      for (i=0; i<h-2*r; i++,src+=w) {
+        sum_type sum = src[r*w]*kernel[r], v;
+        for (m=0; m<r; m++)
+         sum += (src[m*w] + src[(2*r-m)*w]) * kernel[m];
+       *(src) = static_cast<T>(sum * factor);
+      }
+    }
+    int offset = r*w + r;
+    for (i=h-2*r-1;i>=0;i--) {
+      T* src = I[w];
+      for (j=0;j<w-2*r;j++, src++) {
+        sum_type sum = src[r] * kernel[r], v;
+        for (m=0; m<r; m++)
+         sum += (src[m] + src[2*r-m])*kernel[m];
+       *(src+offset) = static_cast<T>(sum*factor);
+      }
+    }
+  }
+  
+
+template <class A, class B> struct GetPixelRowTyped {
+  static inline const B* get(const A* row, int w, B* rowbuf) {
+    std::copy(row, row+w, rowbuf);
+    return rowbuf;
+  }
+};
+
+template <class T> struct GetPixelRowTyped<T,T> {
+  static inline const T* get(const T* row, int w, T* rowbuf) {
+    return row;
+  }
+};
+
+template <class A, class B> const B* getPixelRowTyped(const A* row, int n, B* rowbuf) {
+  return GetPixelRowTyped<A,B>::get(row,n,rowbuf);
+}
+
+template <class T, class S> struct CastCopy {
+  static inline void cast_copy(const T* from, S* to, int count) {
+    for (int i=0; i<count; i++)
+      to[i] = static_cast<S>(from[i]);
+  }
+};
+
+template <class T> struct CastCopy<T,T> {
+  static inline void cast_copy(const T* from, T* to, int count) {
+    std::copy(from, from+count, to);
+  }
+};
+
+template <class T, class S> inline void cast_copy(const T* from, S* to, int count) { CastCopy<T,S>::cast_copy(from,to,count); }
+
+template <class T, int N=-1, int C = Pixel::Component<T>::count> struct ConvolveMiddle {
+  template <class S> static inline T at(const T* input, const S& factor, const S* kernel) { return ConvolveMiddle<T,-1,C>::at(input,factor, kernel, N); }
+};
+
+template <class T, int N> struct ConvolveMiddle<T,N,1> {
+  template <class S> static inline T at(const T* input, const S& factor, const S* kernel) { return ConvolveMiddle<T,N-1>::at(input,factor, kernel) + (input[-N]+input[N])*kernel[N-1]; }
+};
+
+template <class T> struct ConvolveMiddle<T,-1,1> {
+  template <class S> static inline T at(const T* input, const S& factor, const S* kernel, int ksize) {
+    T hsum = *input * factor;
+    for (int k=0; k<ksize; k++)
+      hsum += (input[-k-1] + input[k+1]) * kernel[k];
+    return hsum;
+  }
+};
+
+template <class T, int C> struct ConvolveMiddle<T,-1, C> {
+  template <class S> static inline T at(const T* input, const S& factor, const S* kernel, int ksize) {
+    T hsum = *input * factor;
+    for (int k=0; k<ksize; k++)
+      hsum += (input[-k-1] + input[k+1]) * kernel[k];
+    return hsum;
+  }
+};
+
+template <class T> struct ConvolveMiddle<T,0,1> {
+  template <class S> static inline T at(const T* input, const S& factor, const S* kernel) { return *input * factor; }
+};
+
+#if 1
+template <class T,class S> inline const T* convolveMiddle(const T* input, const S& factor, const S* kernel, int ksize, int n, T* output) {
+#define CALL_CM(I) for (int j=0; j<n; j++, input++) { *(output++) = ConvolveMiddle<T,I>::at(input, factor, kernel); } break
+  switch (ksize) {
+  case 0: CALL_CM(0);
+  case 1: CALL_CM(1);
+  case 2: CALL_CM(2);
+  case 3: CALL_CM(3);
+  case 4: CALL_CM(4);
+  case 5: CALL_CM(5);
+  case 6: CALL_CM(6);
+  case 7: CALL_CM(7);
+  case 8: CALL_CM(8);
+  case 9: CALL_CM(9);
+  case 10: CALL_CM(10);
+  case 11: CALL_CM(11);
+  case 12: CALL_CM(12);
+  default: for (int j=0; j<n; j++, input++) { *(output++) = ConvolveMiddle<T,-1>::at(input, factor, kernel, ksize); }
+  }
+  return input;
+#undef CALL_CM
+}
+
+#else
+
+template <class T,class S> const T* convolveMiddle(const T* input, const S& factor, const S* kernel, int ksize, int n, T* output) {
+    assign_multiple(input, factor, output, n);    
+    for (int r=0; r <ksize; r++) {
+       add_multiple_of_sum(input-r-1, input+r+1, kernel[r], output, n);
+    }
+    return input + n;
+}
+
+#endif
+
+template <class T> inline void convolveGaussian(BasicImage<T>& I, double sigma, double sigmas=3.0)
+{
+  convolveGaussian(I,I,sigma,sigmas);
+}
+
+template <class T> void convolveGaussian(const BasicImage<T>& I, BasicImage<T>& out, double sigma, double sigmas=3.0)
+{
+  typedef typename Pixel::traits<typename Pixel::Component<T>::type>::float_type sum_comp_type;
+  typedef typename Pixel::traits<T>::float_type sum_type;
+  assert(out.size() == I.size());
+  int ksize = (int)(sigmas*sigma + 0.5);
+  sum_comp_type kernel[ksize];
+  sum_comp_type ksum = sum_comp_type();
+  for (int i=1; i<=ksize; i++)
+    ksum += (kernel[i-1] = static_cast<sum_comp_type>(exp(-i*i/(2*sigma*sigma))));
+  for (int i=0; i<ksize; i++)
+    kernel[i] /= (2*ksum+1);
+  double factor = 1.0/(2*ksum+1);
+  int w = I.size().x;
+  int h = I.size().y;
+  int swin = 2*ksize;
+
+  sum_type* buffer = Internal::aligned_mem<sum_type,16>::alloc(w*(swin+1));
+  sum_type* rowbuf = Internal::aligned_mem<sum_type,16>::alloc(w);
+  sum_type* outbuf = Internal::aligned_mem<sum_type,16>::alloc(w);
+
+  sum_type* rows[swin+1];
+  for (int k=0;k<swin+1;k++)
+    rows[k] = buffer + k*w;
+
+  T* output = out.data();
+  for (int i=0; i<h; i++) {
+    sum_type* next_row = rows[swin];
+    const sum_type* input = getPixelRowTyped(I[i], w, rowbuf);
+    // beginning of row
+    for (int j=0; j<ksize; j++) {
+      sum_type hsum = input[j] * factor;
+      for (int k=0; k<ksize; k++)
+       hsum += (input[abs(j-k-1)] + input[j+k+1]) * kernel[k];
+      next_row[j] = hsum;
+    }
+    // middle of row
+    input += ksize;
+    input = convolveMiddle<sum_type, sum_comp_type>(input, factor, kernel, ksize, w-swin, next_row+ksize);
+    // end of row
+    for (int j=w-ksize; j<w; j++, input++) {
+      sum_type hsum = *input * factor;
+      const int room = w-j;
+      for (int k=0; k<ksize; k++) {
+       hsum += (input[-k-1] + (k+1 >= room ? input[2*room-k-2] : input[k+1])) * kernel[k];
+      }
+      next_row[j] = hsum;
+    }
+    // vertical
+    if (i >= swin) {
+      const sum_type* middle_row = rows[ksize];
+      assign_multiple(middle_row, factor, outbuf, w);
+      for (int k=0; k<ksize; k++) {
+       const sum_comp_type m = kernel[k];
+       const sum_type* row1 = rows[ksize-k-1];
+       const sum_type* row2 = rows[ksize+k+1]; 
+       add_multiple_of_sum(row1, row2, m, outbuf, w);
+      }
+      cast_copy(outbuf, output, w);
+      output += w;
+      if (i == h-1) {
+       for (int r=0; r<ksize; r++) {
+         const sum_type* middle_row = rows[ksize+r+1];
+         assign_multiple(middle_row, factor, outbuf, w);
+         for (int k=0; k<ksize; k++) {
+           const sum_comp_type m = kernel[k];
+           const sum_type* row1 = rows[ksize+r-k];
+           const sum_type* row2 = rows[ksize+r+k+2 > swin ? 2*swin - (ksize+r+k+2) : ksize+r+k+2];
+           add_multiple_of_sum(row1, row2, m, outbuf, w);
+         }
+         cast_copy(outbuf, output, w);
+         output += w;
+       }       
+      }
+    } else if (i == swin-1) {
+      for (int r=0; r<ksize; r++) {
+       const sum_type* middle_row = rows[r+1];
+       assign_multiple(middle_row, factor, outbuf, w);
+       for (int k=0; k<ksize; k++) {
+         const sum_comp_type m = kernel[k];
+         const sum_type* row1 = rows[abs(r-k-1)+1];
+         const sum_type* row2 = rows[r+k+2];   
+         add_multiple_of_sum(row1, row2, m, outbuf, w);
+       }
+       cast_copy(outbuf, output, w);
+       output += w;
+      }
+    }
+    
+    sum_type* tmp = rows[0];
+    for (int r=0;r<swin; r++)
+      rows[r] = rows[r+1];
+    rows[swin] = tmp;
+  }
+
+  Internal::aligned_mem<sum_type,16>::release(buffer);
+  Internal::aligned_mem<sum_type,16>::release(rowbuf);
+  Internal::aligned_mem<sum_type,16>::release(outbuf);
+}
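
(Illustrative usage only, not part of the patch; gaussian_example is a hypothetical caller. The kernel radius is sigmas*sigma samples, 3*sigma by default.)

    #include <cvd/convolution.h>
    #include <cvd/image.h>

    void gaussian_example(CVD::Image<float>& img)
    {
        CVD::Image<float> smooth(img.size());
        CVD::convolveGaussian(img, smooth, 2.0);   // sigma = 2.0, result in smooth
        CVD::convolveGaussian(img, 1.0);           // in-place overload
    }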
+
+template <class T, class O, class K> void convolve_gaussian_3(const BasicImage<T>& I, BasicImage<O>& out, K k1, K k2)
+{    
+    assert(I.size() == out.size());
+    const T* a=I.data();
+    const int w = I.size().x;
+    O* o = out.data()+w+1;
+    int total = I.totalsize() - 2*w-2;
+    const double cross = k1*k2;
+    k1 *= k1;
+    k2 *= k2;
+    while (total--) {
+       const double sum = k1*(a[0] + a[2] + a[w*2] + a[w*2+2]) + cross*(a[1] + a[w*2+1] + a[w] + a[w+2]) + k2*a[w+1];
+       *o++ = Pixel::scalar_convert<O,T,double>(sum);
+       ++a;
+    }
+}
+
+} // namespace CVD
+
+#endif
Index: libcvd/cvd/utility.h
diff -u /dev/null libcvd/cvd/utility.h:1.4.2.1
--- /dev/null   Wed May 24 00:38:31 2006
+++ libcvd/cvd/utility.h        Wed May 24 00:38:31 2006
@@ -0,0 +1,197 @@
+#ifndef CVD_UTILITY_H
+#define CVD_UTILITY_H
+
+#include <cvd/config.h>
+#include <cvd/image.h>
+#include <cvd/internal/is_pod.h>
+#include <cvd/internal/pixel_traits.h>
+#include <cvd/internal/convert_pixel_types.h>
+
+namespace CVD { //begin namespace
+
+  /** Generic image copy function for copying sub rectangles of images into
+  other images. This performs pixel type conversion if the input and output
+  images are different pixel types.
+  @param in input image to copy from
+  @param out output image to copy into
+  @param size size of the area to copy. By default this is the entirety of the input image
+  @param begin upper left corner of the area to copy, by default the upper left corner of the input image
+  @param dst upper left corner of the destination in the output image, by default the upper left corner of the output image
+  @throw ImageRefNotInImage if either begin is not in the input image or dst not in the output image
+  @ingroup gImageIO
+  */
+  template<class S, class T> void copy(const BasicImage<S>& in, BasicImage<T>& out, ImageRef size=ImageRef(-1,-1), ImageRef begin = ImageRef(), ImageRef dst = ImageRef())
+  {
+    if (size.x == -1 && size.y == -1)
+      size = in.size();
+    // FIXME: This should be an exception, but which one do I use? 
+    // I refuse to define another "ImageRefNotInImage" in this namespace.
+    if (!(in.in_image(begin) && out.in_image(dst) && in.in_image(begin+size - ImageRef(1,1)) && out.in_image(dst+size - ImageRef(1,1)))){
+       std::cerr << "bad copy: " << in.size() << " " << out.size() << " " << size << " " << begin << " " << dst << std::endl;
+       int *p = 0;
+       *p = 1;
+    }
+    if (in.size() == out.size() && size == in.size() && begin == ImageRef() && dst == ImageRef()) {
+      Pixel::ConvertPixels<S,T>::convert(in.data(), out.data(), in.totalsize());
+      return;
+    }
+    
+    const S* from = &in[begin];
+    T* to = &out[dst];
+    int i = 0;
+    while (i++<size.y) {
+      Pixel::ConvertPixels<S,T>::convert(from, to, size.x);
+      from += in.size().x;
+      to += out.size().x;
+    }
+  }
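
(Illustrative usage only, not part of the patch; copy_example is a hypothetical caller showing a sub-rectangle copy with pixel type conversion, and it assumes dst is at least 64x48.)

    #include <cvd/utility.h>
    #include <cvd/image.h>
    #include <cvd/byte.h>

    void copy_example(const CVD::BasicImage<CVD::byte>& src, CVD::BasicImage<float>& dst)
    {
        // Copy a 64x48 region starting at (10,20) in src to (0,0) in dst,
        // converting byte pixels to float along the way.
        CVD::copy(src, dst, CVD::ImageRef(64,48), CVD::ImageRef(10,20), CVD::ImageRef(0,0));
    }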
+  
+  template <class T, bool pod = Internal::is_POD<T>::is_pod> struct ZeroPixel {
+      static void zero(T& t) { 
+         for (unsigned int c=0; c<Pixel::Component<T>::count; c++)
+             Pixel::Component<T>::get(t,c) = 0;
+      }
+  };
+  
+  template <class T> struct ZeroPixel<T,true> {
+      static void zero(T& t) { memset(&t,0,sizeof(T)); }
+  };
+  
+  template <class T, bool pod = Internal::is_POD<T>::is_pod> struct ZeroPixels {
+      static void zero(T* pixels, int count) {
+         if (count) {
+             ZeroPixel<T>::zero(*pixels);
+             std::fill(pixels+1, pixels+count, *pixels);
+         }
+      }
+  };
+
+  template <class T> struct ZeroPixels<T,true> {
+      static void zero(T* pixels, int count) {
+         memset(pixels, 0, sizeof(T)*count);
+      }
+  };
+  
+
+  /// Set a pixel to the default value (typically 0)
+  /// For multi-component pixels, this zeros all components (sets them to defaults)
+  template <class T> inline void zeroPixel(T& pixel) { ZeroPixel<T>::zero(pixel); }
+
+  /// Set many pixels to the default value (typically 0)
+  /// For multi-component pixels, this zeros all components (sets them to defaults)
+  template <class T> inline void zeroPixels(T* pixels, int count) {  ZeroPixels<T>::zero(pixels, count);  }
+  
+  /// Set the one-pixel border (top, bottom, sides) of an image to zero values
+  template <class T> void zeroBorders(BasicImage<T>& I)
+  {
+    if (I.size().y == 0)
+      return;
+    zeroPixels(I[0], I.size().x);
+    for (int r=0;r<I.size().y-1; r++)
+       zeroPixels(I[r]+I.size().x-1,2);
+    zeroPixels(I[I.size().y-1], I.size().x);
+  }
+
+  /// Compute pointwise differences (a_i - b_i) and store in diff_i
+  /// This is accelerated using SIMD for some platforms and data types (alignment is checked at runtime)
+  /// Do not specify template parameters explicitly so that overloading can choose the right implementation
+  template <class A, class B> inline void differences(const A* a, const A* b, B* diff, unsigned int count)
+  {
+      while (count--)
+         *(diff++) = (B)*(a++) - (B)*(b++);
+  }
+
+  /// Compute pointwise (a_i + b_i) * c and add to out_i
+  /// This is accelerated using SIMD for some platforms and data types (alignment is checked at runtime)
+  /// Do not specify template parameters explicitly so that overloading can choose the right implementation
+  template <class A, class B> inline void add_multiple_of_sum(const A* a, const A* b, const A& c,  B* out, unsigned int count)
+  {
+      while (count--)
+         *(out++) += (*(a++) + *(b++)) * c;
+  }
+  
+  /// Compute pointwise a_i * c and store in out_i
+  /// This is accelerated using SIMD for some platforms and data types (alignment is checked at runtime)
+  /// Do not specify template parameters explicitly so that overloading can choose the right implementation
+  template <class A, class B> inline void assign_multiple(const A* a, const A& c,  B* out, unsigned int count)
+  {
+      while (count--)
+         *(out++) = static_cast<B>(*(a++) * c);
+  }
+
+  /// Compute sum(a_i*b_i)
+  /// This is accelerated using SIMD for some platforms and data types (alignment is checked at runtime)
+  /// Do not specify template parameters explicitly so that overloading can choose the right implementation
+  template <class T> double inner_product(const T* a, const T* b, unsigned int count) {
+      double dot = 0;
+      while (count--)
+         dot += *(a++) * *(b++);
+      return dot;
+  }
+
+  template <class R, class D, class T> struct SumSquaredDifferences {
+      static inline R sum_squared_differences(const T* a, const T* b, size_t count) {
+         R ssd = 0;
+         while (count--) {
+             D d = *a++ - *b++;
+             ssd += d*d;
+         }
+         return ssd;
+      }
+  };
+ 
+  /// Compute sum of (a_i - b_i)^2 (the SSD)
+  /// This is accelerated using SIMD for some platforms and data types (alignment is checked at runtime)
+  /// Do not specify template parameters explicitly so that overloading can choose the right implementation
+  template <class T> inline double sum_squared_differences(const T* a, const T* b, size_t count) {
+      return SumSquaredDifferences<double,double,T>::sum_squared_differences(a,b,count);
+  }
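
(Illustrative usage only, not part of the patch; utility_example is a hypothetical caller. Note that no template arguments are spelled out, so the non-template SIMD overloads declared below can be chosen when the build enables them.)

    #include <cvd/utility.h>

    void utility_example(const float* a, const float* b, float* out, unsigned int n)
    {
        CVD::differences(a, b, out, n);               // out[i] = a[i] - b[i]
        CVD::add_multiple_of_sum(a, b, 0.5f, out, n); // out[i] += (a[i] + b[i]) * 0.5
        CVD::assign_multiple(a, 2.0f, out, n);        // out[i] = a[i] * 2
        double dot = CVD::inner_product(a, b, n);     // sum of a[i] * b[i]
        double ssd = CVD::sum_squared_differences(a, b, n);
        (void)dot; (void)ssd;
    }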
+  
+  /// Check if the pointer is aligned to the specified byte granularity
+  template<int bytes> bool is_aligned(const void* ptr);
+  template<> inline bool is_aligned<8>(const void* ptr) {   return ((reinterpret_cast<size_t>(ptr)) & 0x7) == 0;   }
+  template<> inline bool is_aligned<16>(const void* ptr) {  return ((reinterpret_cast<size_t>(ptr)) & 0xF) == 0;   }
+
+  /// Compute the number of pointer increments necessary to yield alignment of A bytes
+  template<int A, class T> inline size_t steps_to_align(const T* ptr)
+  {
+      return is_aligned<A>(ptr) ? 0 : (A-((reinterpret_cast<size_t>(ptr)) & (A-1)))/sizeof(T);
+  }
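
(Illustrative only, not part of the patch; alignment_example is hypothetical. It shows the invariant the maybe_aligned_* dispatchers in cvd_src/utility.cc rely on: advancing a pointer by steps_to_align<A>() elements lands on an A-byte boundary.)

    #include <cvd/utility.h>
    #include <cassert>

    void alignment_example(const float* p)
    {
        const float* q = p + CVD::steps_to_align<16>(p);
        assert(CVD::is_aligned<16>(q));
        (void)q;
    }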
+
+#if defined(CVD_HAVE_MMXEXT) && defined(CVD_HAVE_MMINTRIN)
+  void differences(const byte* a, const byte* b, short* diff, unsigned int size);
+  void differences(const short* a, const short* b, short* diff, unsigned int size);
+#endif  
+
+
+#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_XMMINTRIN)
+  void differences(const float* a, const float* b, float* diff, unsigned int size);
+  void add_multiple_of_sum(const float* a, const float* b, const float& c,  float* out, unsigned int count);
+  void assign_multiple(const float* a, const float& c,  float* out, unsigned int count);
+  double inner_product(const float* a, const float* b, unsigned int count);
+  double sum_squared_differences(const float* a, const float* b, size_t count);
+#endif
+
+#if defined (CVD_HAVE_SSE2) && defined(CVD_HAVE_EMMINTRIN)
+  void differences(const int32_t* a, const int32_t* b, int32_t* diff, unsigned int size);
+  void differences(const double* a, const double* b, double* diff, unsigned int size);
+  void add_multiple_of_sum(const double* a, const double* b, const double& c,  double* out, unsigned int count);
+  void assign_multiple(const double* a, const double& c,  double* out, unsigned int count);
+  double inner_product(const double* a, const double* b, unsigned int count);
+  double sum_squared_differences(const double* a, const double* b, size_t count);
+  long long sum_squared_differences(const byte* a, const byte* b, size_t count);
+#else  
+  inline long long sum_squared_differences(const byte* a, const byte* b, size_t count) {
+      return SumSquaredDifferences<long long,int,byte>::sum_squared_differences(a,b,count);
+  }
+#endif 
+  
+}
+
+#endif
Index: libcvd/cvd_src/utility.cc
diff -u /dev/null libcvd/cvd_src/utility.cc:1.5.2.1
--- /dev/null   Wed May 24 00:38:31 2006
+++ libcvd/cvd_src/utility.cc   Wed May 24 00:38:31 2006
@@ -0,0 +1,585 @@
+#include <cvd/config.h>
+#include <cvd/utility.h>
+// internal functions used by CVD vision algorithm implementations
+#include <cvd/internal/assembly.h>
+
+#if CVD_HAVE_MMINTRIN
+#    include <mmintrin.h>
+#endif
+
+#if CVD_HAVE_XMMINTRIN
+#    include <xmmintrin.h>
+#endif
+
+#if CVD_HAVE_EMMINTRIN
+#    include <emmintrin.h>
+#endif
+
+using namespace std;
+
+    using CVD::is_aligned;
+    using CVD::steps_to_align;
+    template <class F, class T1, class T2, int A, int M> inline void maybe_aligned_differences(const T1* a, const T1* b, T2* c, unsigned int count)
+    {
+       if (count < M*2) {
+           F::unaligned_differences(a,b,c,count);
+           return;
+       }
+       if (!is_aligned<A>(a)) {            
+           unsigned int steps = steps_to_align<A>(a);
+           F::unaligned_differences(a,b,c,steps);
+           count -= steps;
+           a += steps;
+           b += steps;
+           c += steps;
+       }
+       if (!is_aligned<A>(c) || count < M) {
+           F::unaligned_differences(a,b,c,count);
+           return;
+       }       
+       unsigned int block = (count/M)*M;
+       F::aligned_differences(a,b,c,block);
+       if (count > block) {
+           F::unaligned_differences(a+block,b+block,c+block,count-block);
+       }
+    }    
+    
+    template <class F, class T1, class T2, int A, int M> inline void maybe_aligned_add_mul_add(const T1* a, const T1* b, const T1& c, T2* out, unsigned int count)
+    {
+       if (count < M*2) {
+           F::unaligned_add_mul_add(a,b,c,out,count);
+           return;
+       }
+       if (!is_aligned<A>(a)) {      
+           unsigned int steps = steps_to_align<A>(a);
+           F::unaligned_add_mul_add(a,b,c,out,steps);
+           count -= steps;
+           a += steps;
+           b += steps;
+           out += steps;
+           if (count < M || !is_aligned<16>(out)) {
+               F::unaligned_add_mul_add(a,b,c,out,count);
+               return;
+           }
+       }
+       else if (count < M || !is_aligned<16>(out)) {
+           F::unaligned_add_mul_add(a,b,c,out,count);
+           return;
+       }
+       unsigned int block = (count/M)*M;
+       F::aligned_add_mul_add(a,b,c,out,block);
+       if (count > block)
+           F::unaligned_add_mul_add(a+block,b+block,c, out+block,count-block);
+    }    
+
+    template <class F, class T1, class T2, int A, int M> inline void maybe_aligned_assign_mul(const T1* a, const T1& c, T2* out, unsigned int count)
+    {
+       if (count < M*2) {
+           F::unaligned_assign_mul(a,c,out,count);
+           return;
+       }
+       if (!is_aligned<A>(a)) {      
+           unsigned int steps = steps_to_align<A>(a);
+           F::unaligned_assign_mul(a,c,out,steps);
+           count -= steps;
+           a += steps;
+           out += steps;
+           if (count < M) {
+               F::unaligned_assign_mul(a,c,out,count);
+               return;
+           }
+       }
+       unsigned int block = (count/M)*M;
+       F::aligned_assign_mul(a,c,out,block);
+       if (count > block) {
+           F::unaligned_assign_mul(a+block,c, out+block,count-block);
+       }
+    }    
+
+    template <class F, class R, class T1, int A, int M> inline R maybe_aligned_inner_product(const T1* a, const T1* b, unsigned int count)
+    {
+       if (count < M*2) {
+           return F::unaligned_inner_product(a,b,count);
+       }
+       R sum = 0;
+       if (!is_aligned<A>(a)) {      
+           unsigned int steps = steps_to_align<A>(a);
+           sum = F::unaligned_inner_product(a,b,steps);
+           count -= steps;
+           a += steps;
+           b += steps;
+           if (count < M) {
+               return sum + F::unaligned_inner_product(a,b,count);
+           }
+       }
+       unsigned int block = (count/M)*M;
+       sum += F::aligned_inner_product(a,b,block);
+       if (count > block)
+           sum += F::unaligned_inner_product(a+block,b+block,count-block);
+       return sum;
+    }    
+
+    template <class F, class R, class T1, int A, int M> inline R maybe_aligned_ssd(const T1* a, const T1* b, unsigned int count)
+    {
+       if (count < M*2) {
+           return F::unaligned_ssd(a,b,count);
+       }
+       R sum = 0;
+       if (!is_aligned<A>(a)) {      
+           unsigned int steps = steps_to_align<A>(a);
+           sum = F::unaligned_ssd(a,b,steps);
+           count -= steps;
+           a += steps;
+           b += steps;
+           if (count < M) {
+               return sum + F::unaligned_ssd(a,b,count);
+           }
+       }
+       unsigned int block = (count/M)*M;
+       sum += F::aligned_ssd(a,b,block);
+       if (count > block)
+           sum += F::unaligned_ssd(a+block,b+block,count-block);
+       return sum;
+    }    
+
+
+
+namespace CVD {
+
+#if defined(CVD_HAVE_MMXEXT) && defined(CVD_HAVE_MMINTRIN)
+
+
+    void byte_to_short_differences(const __m64* a, const __m64* b, __m64* diff, unsigned int count)
+    {
+       __m64 z = _mm_setzero_si64();
+       for (;count; --count, ++a, ++b, diff+=2) {
+           __m64 aq = *a;
+           __m64 bq = *b;
+           __m64 alo = _mm_unpacklo_pi8(aq,z);
+           __m64 ahi = _mm_unpackhi_pi8(aq,z);
+           __m64 blo = _mm_unpacklo_pi8(bq,z);
+           __m64 bhi = _mm_unpackhi_pi8(bq,z);
+           diff[0] = _mm_sub_pi16(alo,blo);
+           diff[1] = _mm_sub_pi16(ahi,bhi);
+       }
+       _mm_empty();
+    }
+
+    void short_differences(const __m64* a, const __m64* b, __m64* diff, unsigned int count)
+    {
+       while (count--) {
+           *(diff++) = _mm_sub_pi16(*(a++), *(b++));
+       }
+       _mm_empty();
+    }
+
+    
+    struct MMX_funcs {
+       template <class T1, class T2> static inline void unaligned_differences(const T1* a, const T1* b, T2* diff, size_t count) {
+           differences<T1,T2>(a,b,diff,count);
+       }
+       static inline void aligned_differences(const byte* a, const byte* b, short* diff, unsigned int count) {
+           if (is_aligned<8>(b))
+               byte_to_short_differences((const __m64*)a,(const __m64*)b, (__m64*)diff, count>>3);
+           else
+               unaligned_differences(a,b,diff,count);
+       }
+       
+       static inline void aligned_differences(const short* a, const short* b, short* diff, unsigned int count) {
+           if (is_aligned<8>(b))
+               short_differences((const __m64*)a, (const __m64*)b, (__m64*)diff, count>>2);
+           else
+               unaligned_differences(a,b,diff,count);
+       }
+    };
+
+    void differences(const byte* a, const byte* b, short* diff, unsigned int count) {
+       maybe_aligned_differences<MMX_funcs, byte, short, 8, 8>(a,b,diff,count);
+    }
+    
+    void differences(const short* a, const short* b, short* diff, unsigned int count) {
+       maybe_aligned_differences<MMX_funcs, short, short, 8, 4>(a,b,diff,count);
+    }
+#endif
+
+
+#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_XMMINTRIN)
+
+    template <bool Aligned> inline __m128 load_ps(const void* addr) { return _mm_loadu_ps((const float*)addr); }
+    template <> inline __m128 load_ps<true>(const void* addr) { return _mm_load_ps((const float*)addr); }
+
+    template <bool Aligned> inline void store_ps(__m128 m, void* addr) { return _mm_storeu_ps((float*)addr, m); }
+    template <> inline void store_ps<true>(__m128 m, void* addr) { return _mm_store_ps((float*)addr, m); }
+
+    template <bool Aligned_b> inline void float_differences(const __m128* a, const __m128* b, __m128* diff, unsigned int count)
+    {
+       while (count--) {
+           *(diff++) = _mm_sub_ps(*(a++), load_ps<Aligned_b>(b++));
+       }
+    }
+    
+    template <bool Aligned_b> void float_add_multiple_of_sum(const __m128* a, const __m128* b, const float& c, __m128* out, unsigned int count)
+    {
+       __m128 cccc = _mm_set1_ps(c);
+       while (count--) {
+           *out = _mm_add_ps(_mm_mul_ps(_mm_add_ps(*(a++), load_ps<Aligned_b>(b++)), cccc), *out);
+           ++out;
+       }
+    }
+
+    template <bool Aligned_out> inline void float_assign_multiple(const __m128* a, const float& c, __m128* out, unsigned int count)
+    {
+       const __m128 cccc = _mm_set1_ps(c);
+       while (count--)
+           store_ps<Aligned_out>(_mm_mul_ps(*(a++), cccc), out++);
+       
+    }
+    
+    template <bool Aligned_b> double float_inner_product(const __m128* a, const __m128* b, unsigned int count)
+    {
+       float sums_store[4];
+       const unsigned int BLOCK = 1<<10;
+       double dot = 0;
+       while (count) {
+           size_t pass = std::min(count, BLOCK);
+           count-=pass;
+           __m128 sums = _mm_setzero_ps();
+           while (pass--) {
+               __m128 prod = _mm_mul_ps(*(a++), load_ps<Aligned_b>(b++));
+               sums = _mm_add_ps(prod, sums);
+           }
+           _mm_storeu_ps(sums_store, sums);
+           dot += sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3];
+       }
+       return dot;
+    }
+
+    template <bool Aligned_b> inline double float_sum_squared_differences(const __m128* a, const __m128* b, size_t count)
+    {
+       float sums_store[4];
+       const size_t BLOCK = 1<<10;
+       double ssd = 0;
+       while (count) {
+           size_t pass = std::min(count, BLOCK);
+           count-=pass;
+           __m128 sums = _mm_setzero_ps();
+           while (pass--) {
+               __m128 diff = _mm_sub_ps(*(a++), load_ps<Aligned_b>(b++));
+               sums = _mm_add_ps(_mm_mul_ps(diff,diff), sums);
+           }
+           _mm_storeu_ps(sums_store, sums);
+           ssd += sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3];
+       }
+       return ssd;
+    }
+
+    struct SSE_funcs {
+       template <class T1, class T2> static inline void unaligned_differences(const T1* a, const T1* b, T2* diff, size_t count) {
+           differences<T1,T2>(a,b,diff,count);
+       }
+       
+       static inline void aligned_differences(const float* a, const float* b, float* diff, unsigned int count) {
+           if (is_aligned<16>(b))
+               float_differences<true>((const __m128*)a, (const __m128*)b, (__m128*)diff, count>>2);
+           else
+               float_differences<false>((const __m128*)a, (const __m128*)b, (__m128*)diff, count>>2);
+       }
+
+       template <class T1, class T2> static inline void unaligned_add_mul_add(const T1* a, const T1* b, const T1& c, T2* out, size_t count) {
+           add_multiple_of_sum<T1,T2>(a,b,c,out,count);
+       }
+       static inline void aligned_add_mul_add(const float* a, const float* b, const float& c, float* out, size_t count) {
+           if (is_aligned<16>(b))
+               float_add_multiple_of_sum<true>((const __m128*)a, (const __m128*)b, c, (__m128*)out, count>>2);
+           else
+               float_add_multiple_of_sum<false>((const __m128*)a, (const __m128*)b, c, (__m128*)out, count>>2);
+       }
+
+       template <class T1, class T2> static inline void unaligned_assign_mul(const T1* a, const T1& c, T2* out, size_t count) {
+           assign_multiple<T1,T2>(a,c,out,count);
+       }
+       static inline void aligned_assign_mul(const float* a, const float& c, float* out, size_t count) {
+           if (is_aligned<16>(out))
+               float_assign_multiple<true>((const __m128*)a, c, (__m128*)out, count>>2);
+           else
+               float_assign_multiple<false>((const __m128*)a, c, (__m128*)out, count>>2);
+       }
+
+       template <class T1> static inline double unaligned_inner_product(const T1* a, const T1* b, size_t count) {
+           return inner_product<T1>(a,b,count);
+       }
+       
+       static inline double aligned_inner_product(const float* a, const float* b, unsigned int count)
+       {
+           if (is_aligned<16>(b))
+               return float_inner_product<true>((const __m128*) a, (const __m128*) b, count>>2);
+           else
+               return float_inner_product<false>((const __m128*) a, (const __m128*) b, count>>2);
+       }
+
+       template <class T1> static inline double unaligned_ssd(const T1* a, const T1* b, size_t count) {
+           return sum_squared_differences<T1>(a,b,count);
+       }
+       
+       static inline double aligned_ssd(const float* a, const float* b, unsigned int count)
+       {
+           if (is_aligned<16>(b))
+               return float_sum_squared_differences<true>((const __m128*) a, (const __m128*) b, count>>2);
+           else
+               return float_sum_squared_differences<false>((const __m128*) a, (const __m128*) b, count>>2);
+       }       
+    };
+    
+    void differences(const float* a, const float* b, float* diff, unsigned int size)
+    {
+       maybe_aligned_differences<SSE_funcs, float, float, 16, 4>(a,b,diff,size);
+    }
+    
+    void add_multiple_of_sum(const float* a, const float* b, const float& c,  float* out, unsigned int count)
+    {
+       maybe_aligned_add_mul_add<SSE_funcs,float,float,16,4>(a,b,c,out,count);
+    }
+    
+    void assign_multiple(const float* a, const float& c,  float* out, unsigned int count)
+    {
+       maybe_aligned_assign_mul<SSE_funcs,float,float,16,4>(a,c,out,count);
+    }
+
+    
+    double inner_product(const float* a, const float* b, unsigned int count) 
+    {
+       return maybe_aligned_inner_product<SSE_funcs,double,float,16,4>(a,b,count);
+    }
+
+    double sum_squared_differences(const float* a, const float* b, size_t count)
+    {
+       return maybe_aligned_ssd<SSE_funcs,double,float,16,4>(a,b,count);
+    }
+
+#endif
+
+#if defined (CVD_HAVE_SSE2) && defined(CVD_HAVE_EMMINTRIN)
+
+
+    static inline __m128i zero_si128() { __m128i x; asm ( "pxor %0, %0  \n\t" : "=x"(x) ); return x; }
+    template <bool Aligned> inline __m128i load_si128(const void* addr) { return _mm_loadu_si128((const __m128i*)addr); }
+    template <> inline __m128i load_si128<true>(const void* addr) { return _mm_load_si128((const __m128i*)addr); }
+    template <bool Aligned> inline __m128d load_pd(const void* addr) { return _mm_loadu_pd((const double*)addr); }
+    template <> inline __m128d load_pd<true>(const void* addr) { return _mm_load_pd((const double*)addr); }
+
+    template <bool Aligned> inline void store_pd(__m128d m, void* addr) { return _mm_storeu_pd((double*)addr, m); }
+    template <> inline void store_pd<true>(__m128d m, void* addr) { return _mm_store_pd((double*)addr, m); }
+
+    template <bool Aligned_b> void int_differences(const __m128i* a, const __m128i* b, __m128i* diff, unsigned int count)
+    {
+       while (count--) {
+           *(diff++) = _mm_sub_epi32(*(a++), load_si128<Aligned_b>(b++));
+       }
+    }
+    
+    template <bool Aligned_b> void double_differences(const __m128d* a, const __m128d* b, __m128d* diff, unsigned int count)
+    {
+       while (count--) {
+           *(diff++) = _mm_sub_pd(*(a++), load_pd<Aligned_b>(b++));
+       }
+    }
+
+    template <bool Aligned_b> void double_add_multiple_of_sum(const __m128d* a, const __m128d* b, const double& c, __m128d* out, unsigned int count)
+    {
+       __m128d cc = _mm_set1_pd(c);
+       while (count--) {
+           *out = _mm_add_pd(_mm_mul_pd(_mm_add_pd(*(a++), load_pd<Aligned_b>(b++)), cc), *out);
+           ++out;
+       }
+    }
+
+    template <bool Aligned_out> void double_assign_multiple(const __m128d* a, const double& c, __m128d* out, unsigned int count)
+    {
+       __m128d cc = _mm_set1_pd(c);
+       while (count--)
+           store_pd<Aligned_out>(_mm_mul_pd(*(a++), cc), out++);
+    }
+    template <bool Aligned_b> double double_inner_product(const __m128d* a, const __m128d* b, unsigned int count)
+    {
+       double sums_store[2];
+       const unsigned int BLOCK = 1<<16;
+       double dot = 0;
+       while (count) {
+           size_t pass = std::min(count, BLOCK);
+           count-=pass;
+           __m128d sums = _mm_setzero_pd();
+           while (pass--) {
+               __m128d prod = _mm_mul_pd(*(a++), load_pd<Aligned_b>(b++));
+               sums = _mm_add_pd(prod, sums);
+           }
+           _mm_storeu_pd(sums_store, sums);
+           dot += sums_store[0] + sums_store[1];
+       }
+       return dot;
+    }
+
+    template <bool Aligned_b> long long byte_sum_squared_differences(const __m128i* a, const __m128i* b, unsigned int count) {
+       unsigned long sums_store[4];    
+       const unsigned int BLOCK = 1<<15;
+       long long ssd = 0;
+       while (count) {
+           size_t pass = std::min(count, BLOCK);
+           count -= pass;
+           __m128i sums = _mm_setzero_si128();
+           while (pass--) {
+               __m128i lo_a = load_si128<true>(a++);
+               __m128i lo_b = load_si128<Aligned_b>(b++);
+               __m128i hi_a = _mm_unpackhi_epi8(lo_a, sums);
+               __m128i hi_b = _mm_unpackhi_epi8(lo_b, sums);
+               lo_a = _mm_unpacklo_epi8(lo_a, sums);
+               lo_b = _mm_unpacklo_epi8(lo_b, sums);
+               lo_a = _mm_sub_epi16(lo_a, lo_b);
+               hi_a = _mm_sub_epi16(hi_a, hi_b);
+               lo_a = _mm_madd_epi16(lo_a,lo_a);
+               hi_a = _mm_madd_epi16(hi_a,hi_a);
+               sums = _mm_add_epi32(_mm_add_epi32(lo_a, hi_a), sums);
+           }
+           _mm_storeu_si128((__m128i*)sums_store, sums);
+           ssd += sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3];
+       }
+       return ssd;
+    }
+
+    template <bool Aligned_b> inline double double_sum_squared_differences(const __m128d* a, const __m128d* b, unsigned int count)
+    {
+       double sums_store[2];
+       const unsigned int BLOCK = 1<<10;
+       double ssd = 0;
+       while (count) {
+           size_t pass = std::min(count, BLOCK);
+           count-=pass;
+           __m128d sums = _mm_setzero_pd();
+           while (pass--) {
+               __m128d diff = _mm_sub_pd(*(a++), load_pd<Aligned_b>(b++));
+               sums = _mm_add_pd(_mm_mul_pd(diff,diff), sums);
+           }
+           _mm_storeu_pd(sums_store, sums);
+           ssd += sums_store[0] + sums_store[1];
+       }
+       return ssd;
+    }
+
+    
+    struct SSE2_funcs {
+       template <class T1, class T2> static inline void unaligned_differences(const T1* a, const T1* b, T2* diff, size_t count) {
+           differences<T1,T2>(a,b,diff,count);
+       }
+
+       static inline void aligned_differences(const int32_t* a, const int32_t* b, int32_t* diff, unsigned int count) {
+           if (is_aligned<16>(b))
+               int_differences<true>((const __m128i*)a, (const __m128i*)b, (__m128i*)diff, count>>2);
+           else
+               int_differences<false>((const __m128i*)a, (const __m128i*)b, (__m128i*)diff, count>>2);
+       }
+
+       static inline void aligned_differences(const double* a, const double* b, double* diff, unsigned int count)
+       {
+           if (is_aligned<16>(b))
+               double_differences<true>((const __m128d*)a,(const __m128d*)b,(__m128d*)diff,count>>1);
+           else
+               double_differences<false>((const __m128d*)a,(const __m128d*)b,(__m128d*)diff,count>>1);
+       }
+
+       template <class T1, class T2> static inline void unaligned_add_mul_add(const T1* a, const T1* b, const T1& c, T2* out, size_t count) {
+           add_multiple_of_sum<T1,T2>(a,b,c,out,count);
+       }
+       
+       static inline void aligned_add_mul_add(const double* a, const double* b, const double& c, double* out, unsigned int count)
+       {
+           if (is_aligned<16>(b))
+               double_add_multiple_of_sum<true>((const __m128d*)a, (const __m128d*)b, c, (__m128d*)out, count>>1);
+           else
+               double_add_multiple_of_sum<false>((const __m128d*)a, (const __m128d*)b, c, (__m128d*)out, count>>1);
+       }
+       
+       template <class T1, class T2> static inline void unaligned_assign_mul(const T1* a, const T1& c, T2* out, size_t count) {
+           assign_multiple<T1,T2>(a,c,out,count);
+       }
+
+       static inline void aligned_assign_mul(const double* a, const double& c, double* out, unsigned int count)
+       {
+           if (is_aligned<16>(out))
+               double_assign_multiple<true>((const __m128d*)a, c, (__m128d*)out, count>>1);
+           else
+               double_assign_multiple<false>((const __m128d*)a, c, (__m128d*)out, count>>1);
+       }
+       
+       template <class T1> static inline double unaligned_inner_product(const T1* a, const T1* b, size_t count) {
+           return inner_product<T1>(a,b,count);
+       }
+       
+       static inline double aligned_inner_product(const double* a, const double* b, unsigned int count)
+       {
+           if (is_aligned<16>(b))
+               return double_inner_product<true>((const __m128d*) a, (const __m128d*) b, count>>1);
+           else
+               return double_inner_product<false>((const __m128d*) a, (const __m128d*) b, count>>1);
+       }
+
+       template <class T1> static inline double unaligned_ssd(const T1* a, const T1* b, size_t count) {
+           return sum_squared_differences<T1>(a,b,count);
+       }
+       
+       static inline long long unaligned_ssd(const byte* a, const byte* b, size_t count) {
+           return SumSquaredDifferences<long long, int, byte>::sum_squared_differences(a,b,count);
+       }
+
+       static inline double aligned_ssd(const double* a, const double* b, size_t count)
+       {
+           if (is_aligned<16>(b))
+               return double_sum_squared_differences<true>((const __m128d*)a, (const __m128d*)b, count>>1);
+           else
+               return double_sum_squared_differences<false>((const __m128d*)a, (const __m128d*)b, count>>1);
+       }
+
+       static inline long long aligned_ssd(const byte* a, const byte* b, size_t count)
+       {
+           if (is_aligned<16>(b))
+               return byte_sum_squared_differences<true>((const __m128i*)a, (const __m128i*)b, count>>4);
+           else
+               return byte_sum_squared_differences<false>((const __m128i*)a, (const __m128i*)b, count>>4);
+       }       
+    };
+
+    void differences(const int32_t* a, const int32_t* b, int32_t* diff, unsigned int size)
+    {
+       maybe_aligned_differences<SSE2_funcs, int32_t, int32_t, 16, 4>(a,b,diff,size);
+    }
+
+    void differences(const double* a, const double* b, double* diff, unsigned int size)
+    {
+       maybe_aligned_differences<SSE2_funcs, double, double, 16, 2>(a,b,diff,size);
+    }
+
+    void add_multiple_of_sum(const double* a, const double* b, const double& c,  double* out, unsigned int count)
+    {
+       maybe_aligned_add_mul_add<SSE2_funcs, double, double, 16, 2>(a,b,c,out,count);
+    }
+
+    void assign_multiple(const double* a, const double& c,  double* out, unsigned int count)
+    {
+       maybe_aligned_assign_mul<SSE2_funcs, double, double, 16, 2>(a,c,out,count);
+    }
+
+    double inner_product(const double* a, const double* b, unsigned int count)
+    {
+       return maybe_aligned_inner_product<SSE2_funcs, double, double, 16, 2>(a,b,count);
+    }
+
+    double sum_squared_differences(const double* a, const double* b, size_t count)
+    {
+       return maybe_aligned_ssd<SSE2_funcs, double, double, 16, 2>(a,b,count);
+    }
+
+    long long sum_squared_differences(const byte* a, const byte* b, size_t count)
+    {
+       return maybe_aligned_ssd<SSE2_funcs, long long, byte, 16, 16>(a,b,count);
+    }
+#endif
+    
+}



