[Getfem-commits] r5432 - in /trunk/getfem: ./ contrib/opt

getfem-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Getfem-commits] r5432 - in /trunk/getfem: ./ contrib/opt_assembly/ src/

From:	Yves . Renard
Subject:	[Getfem-commits] r5432 - in /trunk/getfem: ./ contrib/opt_assembly/ src/ src/gmm/
Date:	Fri, 21 Oct 2016 14:20:06 -0000
Author: renard
Date: Fri Oct 21 16:20:04 2016
New Revision: 5432

URL: http://svn.gna.org/viewcvs/getfem?rev=5432&view=rev
Log:
disabling blas interface by default (option --enable-blas-interface) and small 
optimizations

Modified:
    trunk/getfem/configure.ac
    trunk/getfem/contrib/opt_assembly/opt_assembly.cc
    trunk/getfem/src/bgeot_geometric_trans.cc
    trunk/getfem/src/getfem_generic_assembly.cc
    trunk/getfem/src/gmm/gmm_blas_interface.h

Modified: trunk/getfem/configure.ac
URL: 
http://svn.gna.org/viewcvs/getfem/trunk/getfem/configure.ac?rev=5432&r1=5431&r2=5432&view=diff
==============================================================================
--- trunk/getfem/configure.ac   (original)
+++ trunk/getfem/configure.ac   Fri Oct 21 16:20:04 2016
@@ -309,6 +309,19 @@
 LIBS="$LIBS $BLAS_LIBS"
 CPPFLAGS="$CPPFLAGS -DGMM_USES_BLAS"
 
+useblasinterface=NO
+AC_ARG_ENABLE(blas_interface,
+ [AS_HELP_STRING([--enable-blas-interface],[enable the use of the blas call 
for basic algebra routines.])],
+ [case $enableval in
+   yes | "") useblasinterface=YES ;;
+   no) useblasinterface=NO ;;
+  esac],
+ [useblasinterface=NO]
+)
+
+if test x$useblasinterface = xYES; then
+  CPPFLAGS="$CPPFLAGS -DGMM_USES_BLAS_INTERFACE"
+fi
 
 dnl ---------------------------OPENMP------------------------------
 useopenmp=0

Modified: trunk/getfem/contrib/opt_assembly/opt_assembly.cc
URL: 
http://svn.gna.org/viewcvs/getfem/trunk/getfem/contrib/opt_assembly/opt_assembly.cc?rev=5432&r1=5431&r2=5432&view=diff
==============================================================================
--- trunk/getfem/contrib/opt_assembly/opt_assembly.cc   (original)
+++ trunk/getfem/contrib/opt_assembly/opt_assembly.cc   Fri Oct 21 16:20:04 2016
@@ -250,7 +250,7 @@
   
   bool all = false;
   bool select = true;
-  int only_one = 4;
+  int only_one = 5;
 
   if (all || only_one == 1) {
     VEC_TEST_1("Test for source term", ndofu, "u.Test_u", mim, size_type(-1),
@@ -438,40 +438,40 @@
   // - Instructions execution except for assembly ones
   //                        new  | old  | sto  | asse | exec | Ins  |
   test_new_assembly(2, 400, 1); // ndofu = 321602 ndofp = 160801 ndofchi = 1201
-  // Mass (scalar)        : 0.18 | 0.61 | 0.04 | 0.06 | 0.06 | 0.06 |
+  // Mass (scalar)        : 0.18 | 0.59 | 0.04 | 0.06 | 0.06 | 0.06 |
   // Mass (vector)        : 0.30 | 0.82 | 0.09 | 0.15 | 0.06 | 0.09 |
-  // Laplacian            : 0.16 | 0.83 | 0.04 | 0.05 | 0.06 | 0.05 |
+  // Laplacian            : 0.16 | 0.80 | 0.04 | 0.05 | 0.06 | 0.05 |
   // Homogeneous elas     : 0.31 | 1.88 | 0.08 | 0.13 | 0.06 | 0.10 |
   // Non-homogeneous elast: 0.37 | 2.26 | 0.09 | 0.16 | 0.06 | 0.15 |
   test_new_assembly(3, 36, 1);  // ndofu = 151959 ndofp =  50653 ndofchi = 6553
-  // Mass (scalar)        : 0.28 | 0.77 | 0.05 | 0.09 | 0.13 | 0.06 |
-  // Mass (vector)        : 0.76 | 1.54 | 0.17 | 0.29 | 0.13 | 0.34 |
-  // Laplacian            : 0.32 | 1.38 | 0.03 | 0.06 | 0.13 | 0.13 |
-  // Homogeneous elas     : 0.94 | 4.58 | 0.26 | 0.33 | 0.14 | 0.47 |
-  // Non-homogeneous elast: 1.01 | 6.72 | 0.26 | 0.33 | 0.14 | 0.54 |
+  // Mass (scalar)        : 0.27 | 0.75 | 0.05 | 0.08 | 0.13 | 0.06 |
+  // Mass (vector)        : 0.74 | 1.54 | 0.17 | 0.27 | 0.13 | 0.34 |
+  // Laplacian            : 0.29 | 1.37 | 0.03 | 0.06 | 0.13 | 0.10 |
+  // Homogeneous elas     : 0.91 | 4.58 | 0.26 | 0.33 | 0.13 | 0.45 |
+  // Non-homogeneous elast: 0.98 | 6.72 | 0.26 | 0.33 | 0.13 | 0.52 |
   test_new_assembly(2, 200, 2); // ndofu = 321602 ndofp = 160801 ndofchi = 1201
   // Mass (scalar)        : 0.09 | 0.25 | 0.02 | 0.03 | 0.03 | 0.03 |
   // Mass (vector)        : 0.26 | 0.44 | 0.05 | 0.09 | 0.03 | 0.14 |
   // Laplacian            : 0.09 | 0.37 | 0.02 | 0.03 | 0.03 | 0.03 |
   // Homogeneous elas     : 0.26 | 1.28 | 0.06 | 0.09 | 0.03 | 0.14 |
-  // Non-homogeneous elast: 0.31 | 2.40 | 0.07 | 0.10 | 0.03 | 0.18 |
+  // Non-homogeneous elast: 0.31 | 2.38 | 0.07 | 0.10 | 0.03 | 0.18 |
   test_new_assembly(3, 18, 2);  // ndofu = 151959 ndofp =  50653 ndofchi = 6553
-  // Mass (scalar)        : 0.13 | 0.29 | 0.05 | 0.07 | 0.03 | 0.03 |
-  // Mass (vector)        : 1.18 | 0.90 | 0.21 | 0.37 | 0.03 | 0.78 |
+  // Mass (scalar)        : 0.13 | 0.28 | 0.05 | 0.07 | 0.03 | 0.03 |
+  // Mass (vector)        : 1.15 | 0.90 | 0.21 | 0.35 | 0.03 | 0.77 |
   // Laplacian            : 0.11 | 0.55 | 0.03 | 0.05 | 0.03 | 0.05 |
-  // Homogeneous elas     : 1.70 | 3.47 | 0.59 | 0.73 | 0.03 | 0.94 |
-  // Non-homogeneous elast: 1.77 | 9.25 | 0.59 | 0.73 | 0.03 | 1.01 |
+  // Homogeneous elas     : 1.69 | 3.41 | 0.59 | 0.73 | 0.03 | 0.93 |
+  // Non-homogeneous elast: 1.76 | 9.24 | 0.59 | 0.73 | 0.03 | 1.00 |
   test_new_assembly(3, 9, 4);   // ndofu = 151959 ndofp =  50653 ndofchi = 6553
   // Mass (scalar)        : 0.52 | 0.34 | 0.09 | 0.16 | 0.01 | 0.34 |
-  // Mass (vector)        : 4.46 | 1.32 | 0.41 | 1.30 | 0.01 | 3.15 |
-  // Laplacian            : 0.41 | 0.77 | 0.09 | 0.15 | 0.01 | 0.25 |
-  // Homogeneous elas     : 8.99 | 5.26 | 0.88 | 1.43 | 0.01 | 7.55 |
-  // Non-homogeneous elast: 9.14 | 48.0 | 0.76 | 1.41 | 0.01 | 7.72 |
+  // Mass (vector)        : 4.38 | 1.31 | 0.41 | 1.27 | 0.01 | 3.10 |
+  // Laplacian            : 0.40 | 0.77 | 0.09 | 0.14 | 0.01 | 0.25 |
+  // Homogeneous elas     : 8.97 | 5.25 | 0.88 | 1.43 | 0.01 | 7.53 |
+  // Non-homogeneous elast: 9.01 | 47.7 | 0.76 | 1.35 | 0.01 | 7.65 |
 
   // Conclusions :
   // - Desactivation of debug test has no sensible effect.
   // - Compile time of assembly strings is negligible (< 0.0004)
-  // - J computation takes half the computational time of the exec part
+  // - (J, K, B) computation takes half the computational time of the exec part
   // - The optimized instruction call is negligible
   // - For uniform mesh_fem, the resize operations has been suppressed and
   //   the "update pfp" has been isolated in a set  of instruction being
@@ -484,11 +484,10 @@
   // - Optimization of J computation (especially in 2D and standard cases)
   // - Unroll loop in used instructions
   // - Assembly optimization
-  // - Detection of the very simples cases where the elementary atrix do not
+  // - Detection of the very simples cases where the elementary matrix do not
   //   have to be computed on each element (mass matrix, laplacian ...)
   //   on uniform mesh_fem and mesh_im ?
   // - storage optimization (matrices ...)
-  // - Fem interpolation context optimization.
   // - Why such a difference between mass matrix and laplacian for 3D and P2 ?
 
   // Original table :

Modified: trunk/getfem/src/bgeot_geometric_trans.cc
URL: 
http://svn.gna.org/viewcvs/getfem/trunk/getfem/src/bgeot_geometric_trans.cc?rev=5432&r1=5431&r2=5432&view=diff
==============================================================================
--- trunk/getfem/src/bgeot_geometric_trans.cc   (original)
+++ trunk/getfem/src/bgeot_geometric_trans.cc   Fri Oct 21 16:20:04 2016
@@ -35,16 +35,16 @@
     size_type N=gmm::mat_nrows(G), P=gmm::mat_nrows(pc), Q=gmm::mat_ncols(pc);
     if (N && P && Q) {
       auto itK = K.begin();
-      
-      for (size_type j = 0; j < Q; ++j)
-       for (size_type i = 0; i < N; ++i) {
-         auto itG = G.begin() + i;
-         auto itpc = pc.begin() + j*P;
-         register scalar_type a = *(itG) * (*itpc++); itG += N;
-         for (size_type k = 1; k < P; ++k, itG += N)
-           a += *(itG) * (*itpc++);
+      for (size_type j = 0; j < Q; ++j) {
+       auto itpc_j = pc.begin() + j*P, itG_b = G.begin();
+       for (size_type i = 0; i < N; ++i, ++itG_b) {
+         auto itG = itG_b, itpc = itpc_j;
+         register scalar_type a = *(itG) * (*itpc);
+         for (size_type k = 1; k < P; ++k)
+           { itG += N; a += *(itG) * (*++itpc); }
          *itK++ = a;
        }
+      }
     } else gmm::clear(K);
   }
 

Modified: trunk/getfem/src/getfem_generic_assembly.cc
URL: 
http://svn.gna.org/viewcvs/getfem/trunk/getfem/src/getfem_generic_assembly.cc?rev=5432&r1=5431&r2=5432&view=diff
==============================================================================
--- trunk/getfem/src/getfem_generic_assembly.cc (original)
+++ trunk/getfem/src/getfem_generic_assembly.cc Fri Oct 21 16:20:04 2016
@@ -3289,25 +3289,32 @@
       size_type target_dim = Z.sizes()[1];
       size_type Qmult = qdim / target_dim;
       if (Qmult == 1) {
-        gmm::copy(Z.as_vector(), t.as_vector());
+       auto itt = Z.begin(); auto it = t.begin(), ite = t.end();
+       size_type nd = ((t.size()) >> 2);
+       for (size_type i = 0; i < nd; ++i) {
+         *it++ = (*itt++); *it++ = (*itt++);
+         *it++ = (*itt++); *it++ = (*itt++);
+       }
+       for (; it != ite;) *it++ = (*itt++);
+        // gmm::copy(Z.as_vector(), t.as_vector());
       } else {
         size_type ndof = Z.sizes()[0];
         GA_DEBUG_ASSERT(t.size() == Z.size() * Qmult * Qmult,
                         "Wrong size for base vector");
         gmm::clear(t.as_vector());
-        base_tensor::const_iterator itZ = Z.begin();
-        size_type s = t.sizes()[0], ss = s * Qmult, sss = s+1;
-        
-        // Performs t(i*Qmult+j, k*Qmult + j) = Z(i,k);
-        for (size_type k = 0; k < target_dim; ++k) {
-          base_tensor::iterator it = t.begin() + (ss * k);
-          for (size_type i = 0; i < ndof; ++i, ++itZ) {
-            if (i) it += Qmult;
-            base_tensor::iterator it2 = it;
-            *it2 = *itZ;
-            for (size_type j = 1; j < Qmult; ++j) { it2 += sss; *it2 = *itZ; }
-          }
-        }
+       auto itZ = Z.begin();
+       size_type s = t.sizes()[0], ss = s * Qmult, sss = s+1;
+       
+       // Performs t(i*Qmult+j, k*Qmult + j) = Z(i,k);
+       for (size_type k = 0; k < target_dim; ++k) {
+         auto it = t.begin() + (ss * k);
+         
+         for (size_type i = 0; i < ndof; ++i, ++itZ, it += Qmult) {
+           auto it2 = it;
+           *it2 = *itZ;
+           for (size_type j = 1; j < Qmult; ++j) { it2 += sss; *it2 = *itZ; }
+         }
+       }
       }
       return 0;
     }
@@ -3323,7 +3330,14 @@
       size_type target_dim = Z.sizes()[1];
       size_type Qmult = qdim / target_dim;
       if (Qmult == 1) {
-        gmm::copy(Z.as_vector(), t.as_vector());
+       auto itt = Z.begin(); auto it = t.begin(), ite = t.end();
+       size_type nd = ((t.size()) >> 2);
+       for (size_type i = 0; i < nd; ++i) {
+         *it++ = (*itt++); *it++ = (*itt++);
+         *it++ = (*itt++); *it++ = (*itt++);
+       }
+       for (; it != ite;) *it++ = (*itt++);
+        // gmm::copy(Z.as_vector(), t.as_vector());
       } else {
         size_type ndof = Z.sizes()[0];
         size_type N = Z.sizes()[2];
@@ -3332,14 +3346,13 @@
         gmm::clear(t.as_vector());
         base_tensor::const_iterator itZ = Z.begin();
         size_type s = t.sizes()[0], ss = s * Qmult, sss = s+1;
-        size_type ssss=ss*target_dim;
+        size_type ssss = ss*target_dim;
 
         // Performs t(i*Qmult+j, k*Qmult + j, l) = Z(i,k,l);
         for (size_type l = 0; l < N; ++l)
           for (size_type k = 0; k < target_dim; ++k) {
             base_tensor::iterator it = t.begin() + (ss * k + ssss*l);
-            for (size_type i = 0; i < ndof; ++i, ++itZ) {
-              if (i) it += Qmult;
+            for (size_type i = 0; i < ndof; ++i, ++itZ, it += Qmult) {
               base_tensor::iterator it2 = it;
               *it2 = *itZ;
               for (size_type j = 1; j < Qmult; ++j) { it2 += sss; *it2 = *itZ; 
}
@@ -4046,7 +4059,15 @@
     const base_tensor &tc1;
     virtual int exec() {
       GA_DEBUG_INFO("Instruction: tensor copy");
-      gmm::copy(tc1.as_vector(), t.as_vector());
+      
+      auto itt = tc1.begin(); auto it = t.begin(), ite = t.end();
+      size_type nd = ((t.size()) >> 2);
+      for (size_type i = 0; i < nd; ++i) {
+       *it++ = (*itt++); *it++ = (*itt++);
+       *it++ = (*itt++); *it++ = (*itt++);
+      }
+      for (; it != ite;) *it++ = (*itt++);
+      // gmm::copy(tc1.as_vector(), t.as_vector());
       return 0;
     }
     ga_instruction_copy_tensor(base_tensor &t_, const base_tensor &tc1_)
@@ -5414,9 +5435,23 @@
       GA_DEBUG_INFO("Instruction: vector term assembly for fem variable");
       if (ipt == 0 || interpolate) {
         elem.resize(t.size());
-        gmm::copy(gmm::scaled(t.as_vector(), coeff), elem);
+       auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
+       size_type nd = ((t.size()) >> 2);
+       for (size_type i = 0; i < nd; ++i) {
+         *it++ = (*itt++) * coeff; *it++ = (*itt++) * coeff;
+         *it++ = (*itt++) * coeff; *it++ = (*itt++) * coeff;
+       }
+       for (; it != ite;) *it++ = (*itt++) * coeff;
+        // gmm::copy(gmm::scaled(t.as_vector(), coeff), elem);
       } else {
-        gmm::add(gmm::scaled(t.as_vector(), coeff), elem);
+       auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
+       size_type nd = ((t.size()) >> 2);
+       for (size_type i = 0; i < nd; ++i) {
+         *it++ += (*itt++) * coeff; *it++ += (*itt++) * coeff;
+         *it++ += (*itt++) * coeff; *it++ += (*itt++) * coeff;
+       }
+       for (; it != ite;) *it++ += (*itt++) * coeff;
+        // gmm::add(gmm::scaled(t.as_vector(), coeff), elem);
       }
       if (ipt == nbpt-1 || interpolate) {
         const mesh_fem &mf = *(mfg ? *mfg : mfn);
@@ -5597,7 +5632,7 @@
         elem.resize(t.size());
        auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
        scalar_type e = coeff*alpha1*alpha2;
-       size_type nd = t.size() / 4;
+       size_type nd = ((t.size()) >> 2);
        for (size_type i = 0; i < nd; ++i) {
          *it++ = (*itt++) * e; *it++ = (*itt++) * e;
          *it++ = (*itt++) * e; *it++ = (*itt++) * e;
@@ -5608,7 +5643,7 @@
        // Faster than a daxpy blas call on my config
        auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
        scalar_type e = coeff*alpha1*alpha2;
-       size_type nd = t.size() / 4;
+       size_type nd = ((t.size()) >> 2);
        for (size_type i = 0; i < nd; ++i) {
          *it++ += (*itt++) * e; *it++ += (*itt++) * e;
          *it++ += (*itt++) * e; *it++ += (*itt++) * e;
@@ -5724,7 +5759,7 @@
         elem.resize(t.size());
        auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
        scalar_type e = coeff*alpha1*alpha2;
-       size_type nd = t.size() / 4;
+       size_type nd = ((t.size()) >> 2);
        for (size_type i = 0; i < nd; ++i) {
          *it++ = (*itt++) * e; *it++ = (*itt++) * e;
          *it++ = (*itt++) * e; *it++ = (*itt++) * e;
@@ -5735,7 +5770,7 @@
        // Faster than a daxpy blas call on my config
        auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
        scalar_type e = coeff*alpha1*alpha2;
-       size_type nd = t.size() / 4;
+       size_type nd = ((t.size()) >> 2);
        for (size_type i = 0; i < nd; ++i) {
          *it++ += (*itt++) * e; *it++ += (*itt++) * e;
          *it++ += (*itt++) * e; *it++ += (*itt++) * e;
@@ -5811,8 +5846,10 @@
         elem.resize(t.size());
        auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
        scalar_type e = coeff*alpha1*alpha2;
-       size_type nd = t.size() / 4;
+       size_type nd = ((t.size()) >> 3);
        for (size_type i = 0; i < nd; ++i) {
+         *it++ = (*itt++) * e; *it++ = (*itt++) * e;
+         *it++ = (*itt++) * e; *it++ = (*itt++) * e;
          *it++ = (*itt++) * e; *it++ = (*itt++) * e;
          *it++ = (*itt++) * e; *it++ = (*itt++) * e;
        }
@@ -5822,8 +5859,10 @@
        // (Far) faster than a daxpy blas call on my config.
        auto itt = t.begin(); auto it = elem.begin(), ite = elem.end();
        scalar_type e = coeff*alpha1*alpha2;
-       size_type nd = t.size() / 4;
+       size_type nd = ((t.size()) >> 3);
        for (size_type i = 0; i < nd; ++i) {
+         *it++ += (*itt++) * e; *it++ += (*itt++) * e;
+         *it++ += (*itt++) * e; *it++ += (*itt++) * e;
          *it++ += (*itt++) * e; *it++ += (*itt++) * e;
          *it++ += (*itt++) * e; *it++ += (*itt++) * e;
        }

Modified: trunk/getfem/src/gmm/gmm_blas_interface.h
URL: 
http://svn.gna.org/viewcvs/getfem/trunk/getfem/src/gmm/gmm_blas_interface.h?rev=5432&r1=5431&r2=5432&view=diff
==============================================================================
--- trunk/getfem/src/gmm/gmm_blas_interface.h   (original)
+++ trunk/getfem/src/gmm/gmm_blas_interface.h   Fri Oct 21 16:20:04 2016
@@ -46,6 +46,10 @@
 #include "gmm_matrix.h"
 
 namespace gmm {
+
+  // Due to poor performance of level 1 and 2 blas on tested configurations,
+  // this interface is deactivated by default.
+  // Use ./configure --enable-blas-interface to activate it.
 
 #define GMMLAPACK_TRACE(f) 
   // #define GMMLAPACK_TRACE(f) cout << "function " << f << " called" << endl;
@@ -168,6 +172,8 @@
     void  sger_(...); void  dger_(...); void  cgerc_(...); void  zgerc_(...); 
   }
 
+#if defined(GMM_USES_BLAS_INTERFACE)
+
   /* ********************************************************************* */
   /* vect_norm2(x).                                                        */
   /* ********************************************************************* */
@@ -936,6 +942,7 @@
   trsv_interface(upper_tri_solve, trsv_lower, gem_p1_c, gem_trans1_c,
                 ztrsv_, BLAS_Z)
   
+#endif
 }
 
 #endif // GMM_BLAS_INTERFACE_H
[Prev in Thread]
Current Thread
[Next in Thread]
[Getfem-commits] r5432 - in /trunk/getfem: ./ contrib/opt_assembly/ src/ src/gmm/, Yves . Renard <=
Prev by Date: [Getfem-commits] r5430 - /trunk/getfem/src/getfem_generic_assembly.cc
Next by Date: [Getfem-commits] r5433 - /trunk/getfem/src/getfem_generic_assembly.cc
Previous by thread: [Getfem-commits] r5430 - /trunk/getfem/src/getfem_generic_assembly.cc
Next by thread: [Getfem-commits] r5433 - /trunk/getfem/src/getfem_generic_assembly.cc
Index(es):
- Date
- Thread