gnuastro-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[gnuastro-commits] master 95f41d4: Function to find outliers using flat


From: Mohammad Akhlaghi
Subject: [gnuastro-commits] master 95f41d4: Function to find outliers using flat cumulative frequency plot
Date: Sat, 5 Jan 2019 17:50:55 -0500 (EST)

branch: master
commit 95f41d48fe72280b11ffb191a2bd718d3b80e744
Author: Mohammad Akhlaghi <address@hidden>
Commit: Mohammad Akhlaghi <address@hidden>

    Function to find outliers using flat cumulative frequency plot
    
    A new outlier identification algorithm has been defined that uses a
    sufficiently wide flat portion of the cumulative frequency plot. It is
    described in detail within the book.
---
 NEWS                      |  2 ++
 doc/gnuastro.texi         | 39 +++++++++++++++++++++
 lib/gnuastro/statistics.h |  5 +++
 lib/statistics.c          | 89 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 135 insertions(+)

diff --git a/NEWS b/NEWS
index 0a8f6de..4823c2c 100644
--- a/NEWS
+++ b/NEWS
@@ -31,6 +31,8 @@ GNU Astronomy Utilities NEWS                          -*- 
outline -*-
   Library:
     GAL_BLANK_LONG: new macro for the `long' type blank value.
     GAL_BLANK_ULONG: new macro for the `unsigned long' type blank value.
+    gal_statistics_outlier_cumulative: Uses flatness of the cumulative
+       distribution to find outliers.
 
 ** Removed features
 
diff --git a/doc/gnuastro.texi b/doc/gnuastro.texi
index cb157db..66b62b9 100644
--- a/doc/gnuastro.texi
+++ b/doc/gnuastro.texi
@@ -28293,7 +28293,46 @@ difference between the third and fourth, divided by 
the fifth.
 
 @end deftypefun
 
address@hidden {gal_data_t *} gal_statistics_outlier_flat_cfp (gal_data_t 
@code{*input}, size_t @code{dist}, float @code{thresh}, float @code{width}, int 
@code{inplace}, int @code{quiet}, size_t @code{*index})
+
+Return the first element in the given dataset where the cumulative
+frequency plot first becomes flat (below a given threshold on the slope)
+for a sufficient width (percentage of whole dataset's range). The returned
+dataset only has one element (with the same type as the input). If
address@hidden, the index (counting from zero, after sorting dataset
+and removing any blanks) is written in the space that @code{index} points
+to. If no sufficiently flat portion is found, the returned pointer will be
address@hidden
 
+The operation of this function can be best visualized using the cumulative
+frequency plot (CFP, see @ref{Histogram and Cumulative Frequency
+Plot}). Imagine setting the horizontal axis of the input's CFP to a range
+of 0 to 100, and finding the first part where its slope is
+constantly/contiguously flat for a certain fraction/width of the whole
+dataset's range. Below we'll descibe this in more detail.
+
+This function will first remove all the blank elements and sort the
+remaining elements. If @code{inplace} is non-zero this step will be done in
+place: re-organizing/permuting the input dataset. Using @code{inplace} can
+result in faster processing and less RAM consumption, but only when the
+input dataset is no longer needed in its original permutation.
+
+The input dataset will then be internally scaled from 0 to 100 (so
address@hidden and @code{width} can be in units of percent). For each
+element, this function will then estimate the slope of the cumulative
+distribution function by using the @code{dist} elements before and after
+it. In other words, if @mymath{d} is the value of @code{dist}, then the
+slope over the @mymath{i}'th element (@mymath{a_i}) is estimated using this
+formula: @mymath{1/(a_{i+d}-a_{i-d})}. Therefore, when this slope is larger
+than 1, the distance between the two checked points is smaller than
address@hidden of the dataset's range.
+
+All points that have a slope less than @code{thresh} are then marked. The
+first (parsing from the smallest to the largest values) contiguous patch of
+marked elements that is larger than @code{width} wide (in units of
+percentage of the dataset's range) will be considered as the boundary
+region of outliers and the first element in that patch will be returned.
address@hidden deftypefun
 
 
 @node Binary datasets, Labeled datasets, Statistical operations, Gnuastro 
library
diff --git a/lib/gnuastro/statistics.h b/lib/gnuastro/statistics.h
index ded7540..7eeff6a 100644
--- a/lib/gnuastro/statistics.h
+++ b/lib/gnuastro/statistics.h
@@ -178,6 +178,11 @@ gal_statistics_outlier_positive(gal_data_t *input, size_t 
window_size,
                                 float sigma, float sigclip_multip,
                                 float sigclip_param, int inplace, int quiet);
 
+gal_data_t *
+gal_statistics_outlier_flat_cfp(gal_data_t *input, size_t dist, float thresh,
+                                float width, int inplace, int quiet,
+                                size_t *index);
+
 
 
 __END_C_DECLS    /* From C++ preparations */
diff --git a/lib/statistics.c b/lib/statistics.c
index 61888e1..1b45453 100644
--- a/lib/statistics.c
+++ b/lib/statistics.c
@@ -2192,3 +2192,92 @@ gal_statistics_outlier_positive(gal_data_t *input, 
size_t window_size,
   if(nbs!=input) gal_data_free(nbs);
   return out;
 }
+
+
+
+
+
+
+/* Find the outliers using the first flat portion of the cumulative
+   frequency plot. A good value for window and thresh are 5 and 1.0. */
+#define OUTLIER_FLAT_CFP_BYTYPE(IT) {                                   \
+    IT m, n, *a=nbs->array, *p=a+dist, *pp=a+nbs->size-dist;            \
+    min=a[0]; max=a[nbs->size-1];                                       \
+    do                                                                  \
+      {                                                                 \
+        m=( *(p-dist)-min ) / (max-min) * 100.0f;                       \
+        n=( *(p+dist)-min ) / (max-min) * 100.0f;                       \
+        check=1/(n-m);                                                  \
+        if(!quiet) printf("%-6zu%-15g%-15g\n", p-a, (float)(*p), check); \
+        if( check<thresh && start==GAL_BLANK_SIZE_T )                   \
+          { start=p-a; if(!quiet) printf("\t---start\n"); }             \
+        if( check>thresh && start!=GAL_BLANK_SIZE_T )                   \
+          {                                                             \
+             widthcheck=100.0 * (double)(*p-a[start])/(max-min);        \
+             if(!quiet) printf("\t----end: %f\n", widthcheck);          \
+             if( widthcheck > width ) { flatind=start; break; }         \
+             start=GAL_BLANK_SIZE_T;                                    \
+          }                                                             \
+      }                                                                 \
+    while(++p<pp);                                                      \
+    if( pp==p && start!=GAL_BLANK_SIZE_T )                              \
+      {                                                                 \
+        widthcheck=100.0 * (double)(*(p-1)-a[start])/(max-min);         \
+        if(!quiet) printf("\t----end: %f\n", widthcheck);               \
+        if( widthcheck > width ) flatind=start;                         \
+      }                                                                 \
+  }
+
+gal_data_t *
+gal_statistics_outlier_flat_cfp(gal_data_t *input, size_t dist,
+                                float thresh, float width, int inplace,
+                                int quiet, size_t *index)
+{
+  gal_data_t  *nbs, *out=NULL;
+  double min, max, check, widthcheck;
+  size_t one=1, flatind=GAL_BLANK_SIZE_T, start=GAL_BLANK_SIZE_T;
+
+  /* Sanity checks. */
+  if(thresh<=0 || width<=0)
+    error(EXIT_FAILURE, 0, "%s: the values to `thresh' (%g) and `width' (%g) "
+          "must be positive", __func__, thresh, width);
+  if(width==0)
+    error(EXIT_FAILURE, 0, "%s: `dist' (%zu) cannot be zero", __func__,
+          dist);
+
+  /* Remove all blanks and sort the dataset. */
+  nbs=gal_statistics_no_blank_sorted(input, inplace);
+
+  /* Find the index where the distribution becomes sufficiently flat. */
+  switch(nbs->type)
+    {
+    case GAL_TYPE_UINT8:   OUTLIER_FLAT_CFP_BYTYPE( uint8_t  ); break;
+    case GAL_TYPE_INT8:    OUTLIER_FLAT_CFP_BYTYPE( int8_t   ); break;
+    case GAL_TYPE_UINT16:  OUTLIER_FLAT_CFP_BYTYPE( uint16_t ); break;
+    case GAL_TYPE_INT16:   OUTLIER_FLAT_CFP_BYTYPE( int16_t  ); break;
+    case GAL_TYPE_UINT32:  OUTLIER_FLAT_CFP_BYTYPE( uint32_t ); break;
+    case GAL_TYPE_INT32:   OUTLIER_FLAT_CFP_BYTYPE( int32_t  ); break;
+    case GAL_TYPE_UINT64:  OUTLIER_FLAT_CFP_BYTYPE( uint64_t ); break;
+    case GAL_TYPE_INT64:   OUTLIER_FLAT_CFP_BYTYPE( int64_t  ); break;
+    case GAL_TYPE_FLOAT32: OUTLIER_FLAT_CFP_BYTYPE( float    ); break;
+    case GAL_TYPE_FLOAT64: OUTLIER_FLAT_CFP_BYTYPE( double   ); break;
+    default:
+      error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
+            __func__, nbs->type);
+    }
+
+  /* Write the output dataset. */
+  if(flatind!=GAL_BLANK_SIZE_T)
+    {
+      out=gal_data_alloc(NULL, input->type, 1, &one, NULL, 0, -1,
+                         NULL, NULL, NULL);
+      memcpy(out->array,
+             gal_pointer_increment(nbs->array, flatind, nbs->type),
+             gal_type_sizeof(nbs->type));
+    }
+
+  /* Clean up and return. */
+  if(nbs!=input) gal_data_free(nbs);
+  if(index) *index=flatind;
+  return out;
+}



reply via email to

[Prev in Thread] Current Thread [Next in Thread]