gnuastro-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[gnuastro-commits] master f3dbf89: Match can also output the non-matched


From: Mohammad Akhlaghi
Subject: [gnuastro-commits] master f3dbf89: Match can also output the non-matched rows
Date: Tue, 5 Dec 2017 08:25:04 -0500 (EST)

branch: master
commit f3dbf89f6fe3cc0a8e06b080f89f792ede5b857b
Author: Mohammad Akhlaghi <address@hidden>
Commit: Mohammad Akhlaghi <address@hidden>

    Match can also output the non-matched rows
    
    Until now, the output of Match was only the matching rows. Therefore both
    output tables had the same number of rows. But in many contexts (especially
    when you want to check the match parameters), it is also necessary to
    inspect the rows that don't match.
    
    With this commit the new `--notmatched' option has been added to Match;
    with this option, the output tables will contain the rows that don't
    match.
---
 bin/match/args.h  | 14 ++++++++++
 bin/match/main.h  |  1 +
 bin/match/match.c | 36 ++++++++++++++++++++++++-
 bin/match/ui.h    |  9 ++++---
 doc/gnuastro.texi | 78 +++++++++++++++++++++++++++++++------------------------
 5 files changed, 99 insertions(+), 39 deletions(-)

diff --git a/bin/match/args.h b/bin/match/args.h
index ab95b57..9f780cc 100644
--- a/bin/match/args.h
+++ b/bin/match/args.h
@@ -115,6 +115,20 @@ struct argp_option program_options[] =
       GAL_OPTIONS_NOT_SET,
       gal_options_parse_csv_float64
     },
+    {
+      "notmatched",
+      UI_KEY_NOTMATCHED,
+      0,
+      0,
+      "Output is rows that don't match.",
+      UI_GROUP_CATALOGMATCH,
+      &p->notmatched,
+      GAL_OPTIONS_NO_ARG_TYPE,
+      GAL_OPTIONS_RANGE_0_OR_1,
+      GAL_OPTIONS_NOT_MANDATORY,
+      GAL_OPTIONS_NOT_SET
+    },
+
 
 
     {0}
diff --git a/bin/match/main.h b/bin/match/main.h
index 6d065f7..1419dfa 100644
--- a/bin/match/main.h
+++ b/bin/match/main.h
@@ -57,6 +57,7 @@ struct matchparams
   gal_data_t           *ccol2;  /* Array of second input column names.  */
   gal_data_t        *aperture;  /* Acceptable matching aperture.        */
   uint8_t         logasoutput;  /* Don't rearrange inputs, out is log.  */
+  uint8_t          notmatched;  /* Output is rows that don't match.     */
 
   /* Internal */
   int                    mode;  /* Mode of operation: image or catalog. */
diff --git a/bin/match/match.c b/bin/match/match.c
index 403c625..a8f4c23 100644
--- a/bin/match/match.c
+++ b/bin/match/match.c
@@ -47,11 +47,14 @@ match_catalog_write(struct matchparams *p, char *filename, char *hdu,
                     size_t *permutation, size_t nummatched, char *outname,
                     char *extname)
 {
+  size_t origsize;
   gal_data_t *tmp, *cat;
+  gal_list_void_t *arrays=NULL;
 
   /* Read the full table. */
   cat=gal_table_read(filename, hdu, NULL,p->cp.searchin, p->cp.ignorecase,
                      p->cp.minmapsize);
+  origsize=cat->size;
 
   /* Go over each column and permute its contents. */
   for(tmp=cat; tmp!=NULL; tmp=tmp->next)
@@ -62,12 +65,43 @@ match_catalog_write(struct matchparams *p, char *filename, char *hdu,
       /* Correct the size of the array so only the matching columns are
          saved as output. This is only Gnuastro's convention, it has no
          effect on later freeing of the array in the memory. */
-      tmp->size=tmp->dsize[0]=nummatched;
+      if(p->notmatched)
+        {
+          /* Add the original array pointer to a list (we need to reset it
+             later). */
+          gal_list_void_add(&arrays, tmp->array);
+
+          /* Reset the data structure's array element to start where the
+             non-matched elements start. */
+          tmp->array=gal_data_ptr_increment(tmp->array, nummatched,
+                                            tmp->type);
+
+          /* Correct the size of the tile. */
+          tmp->size = tmp->dsize[0] = tmp->size - nummatched;
+        }
+      else
+        tmp->size=tmp->dsize[0]=nummatched;
     }
 
   /* Write the catalog to the output. */
   gal_table_write(cat, NULL, p->cp.tableformat, outname, extname);
 
+  /* Correct arrays and sizes (when `notmatched' was called). The `array'
+     element has to be corrected for later freeing. */
+  if(p->notmatched)
+    {
+      /* Reverse the list of array pointers to write them back in. */
+      gal_list_void_reverse(&arrays);
+
+      /* Correct the array and size pointers. */
+      for(tmp=cat; tmp!=NULL; tmp=tmp->next)
+        {
+          tmp->array=gal_list_void_pop(&arrays);
+          tmp->size=tmp->dsize[0]=origsize;
+          tmp->block=NULL;
+        }
+    }
+
   /* Clean up. */
   gal_list_data_free(cat);
 }
diff --git a/bin/match/ui.h b/bin/match/ui.h
index d5d9d7e..6c70f47 100644
--- a/bin/match/ui.h
+++ b/bin/match/ui.h
@@ -40,8 +40,8 @@ enum program_args_groups
 
 /* Available letters for short options:
 
-   b c d e f g i j k m n p r s t u v w x y z
-   A B C E G J L O Q R W X Y
+   b d e f g i j k m n p r s t u v w x y z
+   A B E G J L O Q R W X Y
 */
 enum option_keys_enum
 {
@@ -49,11 +49,12 @@ enum option_keys_enum
   UI_KEY_HDU2            = 'H',
   UI_KEY_APERTURE        = 'a',
   UI_KEY_LOGASOUTPUT     = 'l',
+  UI_KEY_CCOL1           = 'c',
+  UI_KEY_CCOL2           = 'C',
 
   /* Only with long version (start with a value 1000, the rest will be set
      automatically). */
-  UI_KEY_CCOL1           = 1000,
-  UI_KEY_CCOL2,
+  UI_KEY_NOTMATCHED      = 1000,
 };
 
 
diff --git a/doc/gnuastro.texi b/doc/gnuastro.texi
index be75c4c..2a8af30 100644
--- a/doc/gnuastro.texi
+++ b/doc/gnuastro.texi
@@ -15698,14 +15698,21 @@ description above. When this option is called, a log file called
 @file{astmatch.txt} will not be created. With this option, the default
 output behavior (two tables containing the re-arranged inputs) will be
 
-@item --ccol1=INT/STR[,INT/STR]
+@item --notmatched
+Write the non-matching rows into the outputs, not the matched ones. Note
+that with this option, the two output tables will not necessarily have the
+same number of rows.
+
+@item -c INT/STR[,INT/STR]
+@itemx --ccol1=INT/STR[,INT/STR]
 The coordinate columns of the first input. The number of dimensions for the
 match is determined by the number of comma-separated values given to this
 option. The values can be the column number (counting from 1), exact column
 name or a regular expression. For more, see @ref{Selecting table
 columns}. See the one-line examples above for some usages of this option.
 
-@item --ccol2=INT/STR
+@item -C INT/STR[,INT/STR]
+@itemx --ccol2=INT/STR[,INT/STR]
 The coordinate columns of the second input. See the example in
 @option{--ccol1} for more.
 
@@ -24420,26 +24427,26 @@ complicated, the speed of Python scripts significantly decrease. So when
 the program doesn't change too often and is widely used in a large
 community, mostly on large data sets (like astronomical images), using
 Python will waste a lot of valuable research-hours. It is possible to wrap
-C or C++ functions with Python to fix the speed issue. But this adds to
-complexity, because the interested scientist will now have to master two
+C or C++ functions with Python to fix the speed issue. But this creates
+further complexity, because the interested scientist has to master two
 programming languages and their connection (which is not trivial).
 
 Like C++, Python is object oriented, so as explained above, it needs a high
-level of experience with that particular program to fully understand its
-inner workings. To make things worse, since it is mainly for fast and on
-the go programming@footnote{Note that Python is good for fast programming,
-not fast programs.}, it can undergo significant changes. One recent example
-is how Python 2.x and Python 3.x are not compatible. Lots of research teams
+level of experience with that particular program to reasonably understand
+its inner workings. To make things worse, since it is mainly for on-the-go
+programming@footnote{Note that Python is good for fast programming, not
+fast programs.}, it can undergo significant changes. One recent example is
+how Python 2.x and Python 3.x are not compatible. Lots of research teams
 that invested heavily in Python 2.x cannot benefit from Python 3.x or
 future versions any more. Some converters are available, but since they are
-automatic, lots of complications might arise in the conversion. Thus,
-re-writing all the changes would be the only truly reliable option. If a
-research project begins using Python 3.x today, there is no telling how
-compatible their investments will be when Python 4.x or 5.x will come
-out. This stems from the core principles of Python, which are very useful
-when you look in the `on the go' basis as described before and not future
-usage. Future-proof code (as long as current operating systems will be
-used) is written in C.
+automatic, lots of complications might arise in the conversion@footnote{For
+example see @url{https://arxiv.org/abs/1712.00461, Jenness (2017)} which
+describes how LSST is managing the transition.}. If a research project
+begins using Python 3.x today, there is no telling how compatible their
+investments will be when Python 4.x or 5.x will come out. This stems from
+the core principles of Python, which are very useful when you look in the
+`on the go' basis as described before and not future usage. Future-proof
+code (as long as current operating systems will be used) is written in C.
 
 The portability of C is best demonstrated by the fact that both C++ and
 Python are part of the C-family of programming languages which also include
@@ -24494,11 +24501,14 @@ anything particular to the GNU C library is used in the processing
 functions, it is explained in the comments in between the code.
 
 @cindex GNU Coreutils
-Similar to GNU Coreutils, all the Gnuastro programs provide very low level
-operations. This enables you to use shell scripting languages (for example
-GNU Bash) to operate on a large number of files or do very complex things
-through the creative combinations of these tools that the authors had never
-dreamed of. We have put a few simple examples in @ref{Tutorials}.
+All the Gnuastro programs provide very low level and modular operations
+(modeled on GNU Coreutils). Almost all the basic command-line programs like
+@command{ls}, @command{cp} or @command{rm} on GNU/Linux operating systems
+are part of GNU Coreutils. This enables you to use shell scripting
+languages (for example GNU Bash) to operate on a large number of files or
+do very complex things through the creative combinations of these tools
+that the authors had never dreamed of. We have put a few simple examples in
+@ref{Tutorials}.
 
 @cindex @LaTeX{}
 @cindex GNU Bash
@@ -24512,24 +24522,24 @@ check your results. If you want to include the plots in a document, you can
 use the PGFplots package within @LaTeX{}, no attempt is made to include
 such operations in Gnuastro. In short, Bash can act as a glue to connect
 the inputs and outputs of all these various Gnuastro programs (and other
-programs) in any fashion you please. Of course, Gnuastro's programs are just
+programs) in any fashion. Of course, Gnuastro's programs are just
 front-ends to the main workhorse (@ref{Gnuastro library}), allowing a user
 to create their own programs (for example with @ref{BuildProgram}). So once
 the functions within programs become mature enough, they will be moved
-within the libraries for more general applications.
+within the libraries for even more general applications.
 
 The advantage of this architecture is that the programs become small and
 transparent: the starting and finishing point of every program is clearly
-demarcated. For nearly all operations on a modern computer, the read/write
-speed is very insignificant compared to the actual processing a program
-does. Therefore the complexity which arises from sharing memory in a large
-application is simply not worth the speed gain. Gnuastro's design is
-heavily influenced from Eric Raymond's ``The Art of Unix
-Programming''@footnote{Eric S. Raymond, 2004, @emph{The Art of Unix
-Programming}, Addison-Wesley Professional Computing Series.}  which
-beautifully describes the design philosophy and practice which lead to the
-success of Unix-based operating systems@footnote{KISS principle: Keep It
-Simple, Stupid!}.
+demarcated. For nearly all operations on a modern computer (fast file
+input-output) with a modest level of complexity, the read/write speed is
+insignificant compared to the actual processing a program does. Therefore
+the complexity which arises from sharing memory in a large application is
+simply not worth the speed gain. Gnuastro's design is heavily influenced
+from Eric Raymond's ``The Art of Unix Programming''@footnote{Eric
+S. Raymond, 2004, @emph{The Art of Unix Programming}, Addison-Wesley
+Professional Computing Series.}  which beautifully describes the design
+philosophy and practice which lead to the success of Unix-based operating
+systems@footnote{KISS principle: Keep It Simple, Stupid!}.
 
 
 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]