gnuastro-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[gnuastro-commits] master a7bfa5b 11/19: Match: k-d tree matching UI fix


From: Mohammad Akhlaghi
Subject: [gnuastro-commits] master a7bfa5b 11/19: Match: k-d tree matching UI fixed, to do: finalizing docs and tests
Date: Sun, 14 Nov 2021 20:40:59 -0500 (EST)

branch: master
commit a7bfa5ba545056b68e9a594224d57b76506a5433
Author: Mohammad Akhlaghi <mohammad@akhlaghi.org>
Commit: Mohammad Akhlaghi <mohammad@akhlaghi.org>

    Match: k-d tree matching UI fixed, to do: finalizing docs and tests
    
    Until now, while most low-level aspects of the k-d tree based matching were
    done, the high-level aspects and in paritcular the documentation wasn't
    done yet.
    
    With this commit, the UI now works in the various scenarios and the
    documentation is mostly complete (in describing the algorithms). However,
    the options are still not added and no tests have been added to 'make
    check' yet.
---
 bin/match/args.h                           |  15 ++-
 bin/match/astmatch.conf                    |   2 +
 bin/match/main.h                           |   6 +-
 bin/match/match.c                          | 147 ++++++++++++++++++++---------
 bin/match/ui.c                             | 144 +++++++++++++++++++++++++---
 bin/match/ui.h                             |   1 +
 doc/gnuastro.texi                          | 113 +++++++++++++++++-----
 during-dev-test-data/kdtree-input.fits     | Bin 8640 -> 0 bytes
 during-dev-test-data/match-query.txt       |  10 --
 during-dev-test-data/scripts/gen-inputs.sh |  34 +++++++
 10 files changed, 379 insertions(+), 93 deletions(-)

diff --git a/bin/match/args.h b/bin/match/args.h
index d567c0f..9297d1b 100644
--- a/bin/match/args.h
+++ b/bin/match/args.h
@@ -161,7 +161,7 @@ struct argp_option program_options[] =
       UI_KEY_KDTREE,
       "STR",
       0,
-      "build, internal, automatic, none, FILE.",
+      "build, internal, disable, CUSTOM-FITS-FILE.",
       UI_GROUP_CATALOGMATCH,
       &p->kdtree,
       GAL_TYPE_STRING,
@@ -169,6 +169,19 @@ struct argp_option program_options[] =
       GAL_OPTIONS_NOT_MANDATORY,
       GAL_OPTIONS_NOT_SET,
     },
+    {
+      "kdtreehdu",
+      UI_KEY_KDTREEHDU,
+      "STR",
+      0,
+      "HDU of k-d tree when --kdtree=FILE.fits.",
+      UI_GROUP_CATALOGMATCH,
+      &p->kdtreehdu,
+      GAL_TYPE_STRING,
+      GAL_OPTIONS_RANGE_ANY,
+      GAL_OPTIONS_NOT_MANDATORY,
+      GAL_OPTIONS_NOT_SET,
+    },
 
 
 
diff --git a/bin/match/astmatch.conf b/bin/match/astmatch.conf
index 6e64203..2b89ea6 100644
--- a/bin/match/astmatch.conf
+++ b/bin/match/astmatch.conf
@@ -21,5 +21,7 @@
 
 # Input
  hdu2            1
+ kdtreehdu       1
 
 # Catalog matching
+ kdtree   internal
diff --git a/bin/match/main.h b/bin/match/main.h
index 9084357..bd958d2 100644
--- a/bin/match/main.h
+++ b/bin/match/main.h
@@ -33,6 +33,8 @@ along with Gnuastro. If not, see 
<http://www.gnu.org/licenses/>.
 #define PROGRAM_EXEC   "astmatch" /* Program executable name. */
 #define PROGRAM_STRING PROGRAM_NAME" (" PACKAGE_NAME ") " PACKAGE_VERSION
 
+/* Internal constants */
+#define MATCH_KDTREE_ROOT_KEY "KDTROOT"
 
 
 enum match_modes
@@ -47,7 +49,6 @@ enum kdtree_modes
   MATCH_KDTREE_INVALID,           /* ==0 by default. */
   MATCH_KDTREE_BUILD,
   MATCH_KDTREE_INTERNAL,
-  MATCH_KDTREE_AUTO,
   MATCH_KDTREE_DISABLE,
   MATCH_KDTREE_FILE,
 };
@@ -68,6 +69,7 @@ struct matchparams
   gal_data_t         *outcols;  /* Array of second input column names.  */
   gal_data_t        *aperture;  /* Acceptable matching aperture.        */
   char                *kdtree;  /* The mode to use k-d tree mode.       */
+  char             *kdtreehdu;  /* k-d tree HDU when its a (FITS) file. */
   uint8_t         logasoutput;  /* Don't rearrange inputs, out is log.  */
   uint8_t          notmatched;  /* Output is rows that don't match.     */
 
@@ -84,6 +86,8 @@ struct matchparams
   char              *out2name;  /* Name of second matched output.       */
   gal_list_str_t  *stdinlines;  /* Lines given by Standard input.       */
   int              kdtreemode;  /* The k-d tree mode.                   */
+  gal_data_t      *kdtreedata;
+  size_t           kdtreeroot;
 
   /* Output: */
   time_t              rawtime;  /* Starting time of the program.        */
diff --git a/bin/match/match.c b/bin/match/match.c
index c398146..aaf53dc 100644
--- a/bin/match/match.c
+++ b/bin/match/match.c
@@ -36,6 +36,7 @@ along with Gnuastro. If not, see 
<http://www.gnu.org/licenses/>.
 #include <gnuastro/threads.h>
 #include <gnuastro/permutation.h>
 
+#include <gnuastro-internal/timing.h>
 #include <gnuastro-internal/checkset.h>
 
 #include <main.h>
@@ -453,44 +454,42 @@ match_catalog_write_one_col(struct matchparams *p, 
gal_data_t *a,
 static void
 match_catalog_kdtree_build(struct matchparams *p)
 {
+  char *msg;
   size_t root;
+  struct timeval t1;
   gal_data_t *kdtree;
   gal_fits_list_key_t *keylist=NULL;
 
   /* Meta-data in the output fits file. */
   char *unit = "index";
-  char *keyname = "KDTROOT";
   char *comment = "k-d tree root index (counting from 0).";
 
   /* Construct a k-d tree from 'p->cols1': the index of root is stored in
      'root'. */
+  if(!p->cp.quiet) gettimeofday(&t1, NULL);
   kdtree = gal_kdtree_create(p->cols1, &root);
+  if(!p->cp.quiet)
+    {
+      if( asprintf(&msg, "k-d tree constructed (%zu rows).", p->cols1->size)<0 
)
+        error(EXIT_FAILURE, errno, "asprintf allocation");
+      gal_timing_report(&t1, msg, 1);
+      free(msg);
+    }
 
   /* Write the k-d tree to a file and write root index and input name
      as FITS keywords ('gal_table_write' frees 'keylist'). */
   gal_fits_key_list_title_add(&keylist, "k-d tree parameters", 0);
-  gal_fits_key_write_filename("KDTIN", p->input1name, &keylist, 0);
-  gal_fits_key_list_add_end(&keylist, GAL_TYPE_SIZE_T, keyname, 0,
+  gal_fits_key_write_filename("KDTIN", p->input1name, &keylist, 0,
+                              p->cp.quiet);
+  gal_fits_key_list_add_end(&keylist, GAL_TYPE_SIZE_T,
+                            MATCH_KDTREE_ROOT_KEY, 0,
                             &root, 0, comment, 0, unit, 0);
   gal_table_write(kdtree, &keylist, NULL, GAL_TABLE_FORMAT_BFITS,
                   p->out1name, "kdtree", 0);
 
   /* Let the user know that the k-d tree has been built. */
   if(!p->cp.quiet)
-    fprintf(stdout, "Output (k-d tree): %s\n", p->out1name);
-}
-
-
-
-
-
-/* See if a k-d tree should be used (MATCH_KDTREE_INTERNAL) or classical
-   matching (MATCH_KDTREE_DISABLE). */
-static int
-match_catalog_kdtree_auto(struct matchparams *p)
-{
-  error(EXIT_FAILURE, 0, "%s: not yet implemented!", __func__);
-  return MATCH_KDTREE_INVALID;
+    fprintf(stdout, "  - Output (k-d tree): %s\n", p->out1name);
 }
 
 
@@ -502,15 +501,9 @@ match_catalog_kdtree_auto(struct matchparams *p)
 static gal_data_t *
 match_catalog_kdtree(struct matchparams *p, size_t *nummatched)
 {
-  size_t root;
+  char *msg;
+  struct timeval t1;
   gal_data_t *out=NULL;
-  gal_data_t *kdtree=NULL;
-
-  /* If we are in automatic mode, we should look at the data (number of
-     rows/columns) and system (number of threads) to decide if the mode
-     should be set to 'MATCH_KDTREE_INTERNAL' or 'MATCH_KDTREE_DISABLE'. */
-  if(p->kdtreemode==MATCH_KDTREE_AUTO)
-    p->kdtreemode=match_catalog_kdtree_auto(p);
 
   /* Operate according to the required mode. */
   switch(p->kdtreemode)
@@ -521,13 +514,38 @@ match_catalog_kdtree(struct matchparams *p, size_t 
*nummatched)
       break;
 
     /* Do the k-d tree matching. */
+    case MATCH_KDTREE_FILE:
     case MATCH_KDTREE_INTERNAL:
-      kdtree = gal_kdtree_create(p->cols1, &root);
-      out = gal_match_kdtree(p->cols1, p->cols2, kdtree, root,
-                             p->aperture->array, p->cp.numthreads,
-                             p->cp.minmapsize, p->cp.quietmmap,
-                             nummatched);
-      gal_list_data_free(kdtree);
+
+      /* If the k-d tree should be constructed internally, build it,
+         otherwise, we have already read an checked the k-d tree in 'ui.c',
+         so go directly to the matching. */
+      if(p->kdtreemode==MATCH_KDTREE_INTERNAL)
+        {
+          if(!p->cp.quiet) gettimeofday(&t1, NULL);
+          p->kdtreedata = gal_kdtree_create(p->cols1, &p->kdtreeroot);
+          if(!p->cp.quiet)
+            gal_timing_report(&t1, "Internal k-d tree constructed.", 1);
+        }
+
+      /* Do k-d tree based match. */
+      if(!p->cp.quiet)
+        {
+          gettimeofday(&t1, NULL);
+          printf("  - Match using the k-d tree ...\n");
+        }
+      out = gal_match_kdtree(p->cols1, p->cols2, p->kdtreedata,
+                             p->kdtreeroot, p->aperture->array,
+                             p->cp.numthreads, p->cp.minmapsize,
+                             p->cp.quietmmap, nummatched);
+      if(!p->cp.quiet)
+        {
+          if( asprintf(&msg, "... done (%zu matches found).", *nummatched)<0 )
+            error(EXIT_FAILURE, errno, "asprintf allocation");
+          gal_timing_report(&t1, msg, 1);
+          free(msg);
+        }
+      gal_list_data_free(p->kdtreedata);
       break;
 
     /* No 'default' necessary because the modes include disabling. */
@@ -541,12 +559,48 @@ match_catalog_kdtree(struct matchparams *p, size_t 
*nummatched)
 
 
 
+static gal_data_t *
+match_catalog_sort_based(struct matchparams *p, size_t *nummatched)
+{
+  char *msg;
+  gal_data_t *mcols;
+  struct timeval t1;
+
+  /* Let the user know that the matching has started. */
+  if(!p->cp.quiet)
+    {
+      gettimeofday(&t1, NULL);
+      printf("  - Matching by sorting ...\n");
+    }
+
+  /* Do the matching. */
+  mcols=gal_match_sort_based(p->cols1, p->cols2, p->aperture->array,
+                             0, 1, p->cp.minmapsize, p->cp.quietmmap,
+                             nummatched);
+
+  /* Let the user know that it finished. */
+  if(!p->cp.quiet)
+    {
+      if( asprintf(&msg, "... done (%zu matches found).", *nummatched)<0 )
+        error(EXIT_FAILURE, errno, "asprintf allocation");
+      gal_timing_report(&t1, msg, 1);
+      free(msg);
+    }
+
+  /* Return the permutations. */
+  return mcols;
+}
+
+
+
+
+
 static void
 match_catalog(struct matchparams *p)
 {
   uint32_t *u, *uf;
-  gal_data_t *tmp, *mcols;
-  gal_data_t *a=NULL, *b=NULL;
+  struct timeval t1;
+  gal_data_t *tmp, *a=NULL, *b=NULL, *mcols=NULL;
   size_t nummatched, *acolmatch=NULL, *bcolmatch=NULL;
 
   /* If we want to use kd-tree for matching. */
@@ -560,18 +614,22 @@ match_catalog(struct matchparams *p)
       if(p->kdtreemode==MATCH_KDTREE_BUILD) return;
     }
 
-  /* Find the matching coordinates. We are doing the processing in
-     place. Incase it was decided not to use a k-d tree at all
-     (in 'automatic' mode), then we need to use the classic mode. */
+  /* If k-d tree-based matching wasn't used, use the sort-based
+     matching. */
   if(mcols==NULL)
-    mcols=gal_match_sort_based(p->cols1, p->cols2, p->aperture->array,
-                                0, 1, p->cp.minmapsize, p->cp.quietmmap,
-                                &nummatched);
+    mcols=match_catalog_sort_based(p, &nummatched);
 
   /* If the output is to be taken from the input columns (it isn't just the
      log), then do the job. */
   if(p->logasoutput==0)
     {
+      /* Let the user know what is happening. */
+      if(!p->cp.quiet)
+        {
+          gettimeofday(&t1, NULL);
+          printf("  - Re-arranging matched rows (skip this with 
'--logasoutput')...\n");
+        }
+
       /* Read (and possibly write) the outputs. Note that we only need to
          read the table when it is necessary for the output (the user might
          have asked for '--outcols', only with columns of one of the two
@@ -604,6 +662,10 @@ match_catalog(struct matchparams *p)
       /* Clean up. */
       if(a) gal_list_data_free(a);
       if(b) gal_list_data_free(b);
+
+      /* Let the user know. */
+      if( !p->cp.quiet )
+        gal_timing_report(&t1, "... done.", 1);
     }
 
   /* Write the raw information in a log file if necessary.  */
@@ -655,12 +717,11 @@ match_catalog(struct matchparams *p)
   /* Print the number of matches if not in quiet mode. */
   if(!p->cp.quiet)
     {
-      fprintf(stdout, "Number of matching rows in both catalogs: %zu\n",
-              nummatched);
       if(p->out2name && strcmp(p->out1name, p->out2name))
-        fprintf(stdout, "Output:\n %s\n %s\n", p->out1name, p->out2name);
+        fprintf(stdout, "  - Output-1: %s\n  - Output-2: %s\n",
+                p->out1name, p->out2name);
       else
-        fprintf(stdout, "Output: %s\n", p->out1name);
+        fprintf(stdout, "  - Output: %s\n", p->out1name);
     }
 }
 
diff --git a/bin/match/ui.c b/bin/match/ui.c
index bc4eb70..f260dba 100644
--- a/bin/match/ui.c
+++ b/bin/match/ui.c
@@ -29,6 +29,7 @@ along with Gnuastro. If not, see 
<http://www.gnu.org/licenses/>.
 #include <string.h>
 
 #include <gnuastro/fits.h>
+#include <gnuastro/threads.h>
 
 #include <gnuastro-internal/timing.h>
 #include <gnuastro-internal/options.h>
@@ -107,6 +108,7 @@ ui_initialize_options(struct matchparams *p,
   cp->program_exec       = PROGRAM_EXEC;
   cp->program_bibtex     = PROGRAM_BIBTEX;
   cp->program_authors    = PROGRAM_AUTHORS;
+  cp->numthreads         = gal_threads_number();
   cp->coptions           = gal_commonopts_options;
 
 
@@ -120,7 +122,6 @@ ui_initialize_options(struct matchparams *p,
           cp->coptions[i].doc="Extension name or number of first input.";
           break;
         case GAL_OPTIONS_KEY_TYPE:
-        case GAL_OPTIONS_KEY_NUMTHREADS:
           cp->coptions[i].flags=OPTION_HIDDEN;
           break;
         }
@@ -223,17 +224,15 @@ ui_read_check_only_options(struct matchparams *p)
     /* Set the k-d tree mode. */
     if(      !strcmp(p->kdtree,"build")    ) p->kdtreemode=MATCH_KDTREE_BUILD;
     else if( !strcmp(p->kdtree,"internal") ) 
p->kdtreemode=MATCH_KDTREE_INTERNAL;
-    else if( !strcmp(p->kdtree,"auto")     ) p->kdtreemode=MATCH_KDTREE_AUTO;
     else if( !strcmp(p->kdtree,"disable")  ) 
p->kdtreemode=MATCH_KDTREE_DISABLE;
     else if( gal_fits_name_is_fits(p->kdtree) ) 
p->kdtreemode=MATCH_KDTREE_FILE;
     else
       error(EXIT_FAILURE, 0, "'%s' is not valid for '--kdtree'. The "
             "following values are accepted: 'build' (to build the k-d tree in "
             "the file given to '--output'), 'internal' (to force internal "
-            "usage of a k-d tree for the matching), 'auto' (to decide "
-            "automatically if a k-d tree should be used or not), 'disable' "
-            "(to not use a k-d tree at all), a FITS file name (the file to "
-            "read a created k-d tree from)", p->kdtree);
+            "usage of a k-d tree for the matching), 'disable' (to not use a "
+            "k-d tree at all), a FITS file name (the file to read a created "
+            "k-d tree from)", p->kdtree);
 
     /* Make sure that the k-d tree build mode is not called with
        '--outcols'. */
@@ -243,6 +242,17 @@ ui_read_check_only_options(struct matchparams *p)
             "tree building mode doesn't involve actual matching. It will "
             "only build k-d tree and write it to a file so it can be used "
             "in future matches)");
+
+    /* Make sure that a HDU is also specified for the k-d tree when its an
+       external file. */
+    if( p->kdtreemode==MATCH_KDTREE_FILE && p->kdtreehdu==NULL )
+      error(EXIT_FAILURE, 0, "no '--kdtreehdu' specified! When a FITS "
+            "file is given to '--kdtree', you should specify its "
+            "extension/HDU within the file using '--kdtreehdu'. You "
+            "can either use the HDU name, or its number (counting "
+            "from 0). If you aren't sure what HDUs exist in the file, "
+            "you can use the 'astfits %s' command to see the full list",
+            p->kdtree);
   }
 }
 
@@ -271,10 +281,9 @@ ui_check_options_and_arguments(struct matchparams *p)
           /* Print a warning to let the user know the option will not be
              used (the user had probably confused things if they have given
              it). */
-          error(EXIT_SUCCESS, 0, "WARNING: with '--coord' you only need "
-                "to set '--ccol1' (for the table), '--ccol2' (for the "
-                "'--coord') is not needed because you are directly "
-                "giving the coordinates in any order you want");
+          error(EXIT_SUCCESS, 0, "WARNING: with '--coord' or '--kdtree', "
+                "you only need to set '--ccol1' (since there is only a "
+                "single input table), '--ccol2' is not needed");
         }
     }
 
@@ -767,6 +776,70 @@ ui_read_columns_to_double(struct matchparams *p, char 
*filename, char *hdu,
 
 
 
+#define UI_READ_KDTREE_FIXED_MSG "You can construct a k-d tree "        \
+  "for your first input table using the command below (just set your "  \
+  "desired output name, and give that file to '--kdtree', in a later "  \
+  "call with the same first input catalog):\n\n"                  \
+  "    astmatch %s -h%s --ccol1=%s,%s --kdtree=build --output=KDTREE.fits\n\n" 
\
+  "To learn more about the expected format of k-d trees in Gnuastro, "  \
+  "please run this command: 'info gnuastro k-d'"
+
+static void
+ui_read_kdtree(struct matchparams *p)
+{
+  size_t *sizetarr;
+  char **strarr1 = p->ccol1->array;
+  gal_data_t *keysll=gal_data_array_calloc(1);
+
+  /* Read the external k-d tree. */
+  p->kdtreedata=gal_table_read(p->kdtree, p->kdtreehdu, NULL,
+                               NULL, GAL_TABLE_SEARCH_NAME, 1,
+                               p->cp.minmapsize, p->cp.quietmmap, NULL);
+
+  /* It has to have only two columns, with an unsigned 32-bit integer
+     type in each. */
+  if( gal_list_data_number(p->kdtreedata)!=2 )
+    error(EXIT_FAILURE, 0, "%s (hdu: %s, that was given to "
+          "'--kdtree') has %zu column(s), but should contain 2 "
+          "columns. " UI_READ_KDTREE_FIXED_MSG,
+          p->kdtree, p->kdtreehdu, gal_list_data_number(p->kdtreedata),
+          p->input1name, p->cp.hdu, strarr1[0], strarr1[1]);
+  if( p->kdtreedata->type!=GAL_TYPE_UINT32
+      || p->kdtreedata->next->type!=GAL_TYPE_UINT32 )
+    error(EXIT_FAILURE, 0, "%s (hdu: %s, that was given to "
+          "'--kdtree') doesn't have unsigned 32-bit integer columns. "
+          UI_READ_KDTREE_FIXED_MSG, p->kdtree, p->kdtreehdu,
+          p->input1name, p->cp.hdu, strarr1[0], strarr1[1]);
+  if( p->kdtreedata->size!=p->cols1->size )
+    error(EXIT_FAILURE, 0, "%s (hdu: %s, that was given to "
+          "'--kdtree') doesn't have the same number of rows as "
+          "%s, which is the first given input. "
+          UI_READ_KDTREE_FIXED_MSG, p->kdtree, p->kdtreehdu,
+          gal_fits_name_save_as_string(p->input1name, p->cp.hdu),
+          p->input1name, p->cp.hdu, strarr1[0], strarr1[1]);
+
+  /* Read the k-d tree root. */
+  keysll[0].type=GAL_TYPE_SIZE_T;
+  keysll[0].name=MATCH_KDTREE_ROOT_KEY;
+  gal_fits_key_read(p->kdtree, p->kdtreehdu, keysll, 0, 0);
+  if(keysll[0].status)
+    error(EXIT_FAILURE, 0, "%s (hdu: %s, that was given to "
+          "'--kdtree') doesn't have the '%s' keyword, or it "
+          "couldn't be read as a number. " UI_READ_KDTREE_FIXED_MSG,
+          p->kdtree, p->kdtreehdu, MATCH_KDTREE_ROOT_KEY,
+          p->input1name, p->cp.hdu, strarr1[0], strarr1[1]);
+  sizetarr=keysll[0].array;
+  p->kdtreeroot=sizetarr[0];
+
+  /* Clean up: since the 'name' component wasn't allocated in this example,
+     we should set it to NULL before calling 'gal_data_array_free'. */
+  keysll[0].name=NULL;
+  gal_data_array_free(keysll, 1, 1);
+}
+
+
+
+
 
 /* Read catalog columns */
 static void
@@ -802,6 +875,26 @@ ui_read_columns(struct matchparams *p)
                : ui_read_columns_to_double(p, p->input2name, p->hdu2,
                                            cols2, ndim) );
 
+  /* If an external k-d tree is given, read it and make sure it has the
+     same number of rows as the first input and the proper datatype. */
+  if( p->kdtreemode==MATCH_KDTREE_FILE )
+    ui_read_kdtree(p);
+
+  /* If we are in k-d tree based matching and the second dataset is smaller
+     than the first, print a warning to let the user know that the speed
+     can be greatly improved if they swap the two. */
+  if( !p->cp.quiet
+      && p->kdtreemode!=MATCH_KDTREE_DISABLE
+      && p->cols1->size > p->cols2->size )
+    error(EXIT_SUCCESS, 0, "TIP: the matching speed will GREATLY IMPROVE "
+          "if you swap the two inputs. Currently the second input has "
+          "fewer rows than the first. In the k-d tree based matching, "
+          "multi-threading will occur on the second input's rows and "
+          "the k-d will be constructed for the first input. Fewer "
+          "first-input rows therefore means a smaller tree, and more "
+          "second-input rows will be better distributed in your "
+          "system's CPU");
+
   /* Free the extra spaces. */
   gal_list_str_free(cols1, 1);
   gal_list_str_free(cols2, 1);
@@ -1058,6 +1151,7 @@ ui_preparations(struct matchparams *p)
 void
 ui_read_check_inputs_setup(int argc, char *argv[], struct matchparams *p)
 {
+  size_t nthreads;
   struct gal_options_common_params *cp=&p->cp;
 
 
@@ -1110,7 +1204,32 @@ ui_read_check_inputs_setup(int argc, char *argv[], 
struct matchparams *p)
   /* If the output is a FITS table, prepare all the options as FITS
      keywords to write in output later. */
   if(gal_fits_name_is_fits(p->out1name))
-      gal_options_as_fits_keywords(&p->cp);
+    gal_options_as_fits_keywords(&p->cp);
+
+
+  /* Report the starting information. */
+  /* Let the user know that processing has started. */
+  if(!p->cp.quiet)
+    {
+      printf(PROGRAM_NAME" "PACKAGE_VERSION" started on %s",
+             ctime(&p->rawtime));
+      nthreads=p->kdtreemode==MATCH_KDTREE_DISABLE ? 1 : p->cp.numthreads;
+      printf("  - Using %zu CPU thread%s%s\n", nthreads,
+             nthreads==1 ? "." : "s.",
+             ( p->kdtreemode==MATCH_KDTREE_DISABLE
+               ? " (sort-based match only uses a single thread)" : ""));
+      printf("  - Match algorithm: %s\n",
+             p->kdtree ? "k-d tree" : "sort-based");
+      printf("  - Input-1: %s\n",
+             gal_fits_name_save_as_string(p->input1name, p->cp.hdu));
+      if(p->kdtreemode==MATCH_KDTREE_FILE)
+        printf("  - Input-1 k-d tree: %s\n",
+               gal_fits_name_save_as_string(p->kdtree, p->kdtreehdu));
+      if(p->kdtreemode!=MATCH_KDTREE_BUILD)
+        printf("  - Input-2: %s\n",
+               p->coord ? "from --coord"
+               : gal_fits_name_save_as_string(p->input2name, p->hdu2));
+    }
 }
 
 
@@ -1153,8 +1272,7 @@ ui_free_report(struct matchparams *p, struct timeval *t1)
   gal_list_str_free(p->bcols, 0);
   gal_list_str_free(p->stdinlines, 1);
 
-  /* Print the final message.
+  /* Print the final message. */
   if(!p->cp.quiet)
     gal_timing_report(t1, PROGRAM_NAME" finished in: ", 0);
-  */
 }
diff --git a/bin/match/ui.h b/bin/match/ui.h
index bf42f70..0f33a5d 100644
--- a/bin/match/ui.h
+++ b/bin/match/ui.h
@@ -58,6 +58,7 @@ enum option_keys_enum
      automatically). */
   UI_KEY_NOTMATCHED      = 1000,
   UI_KEY_OUTCOLS,
+  UI_KEY_KDTREEHDU,
 };
 
 
diff --git a/doc/gnuastro.texi b/doc/gnuastro.texi
index cb1accb..566de0e 100644
--- a/doc/gnuastro.texi
+++ b/doc/gnuastro.texi
@@ -576,6 +576,7 @@ Invoking MakeCatalog
 
 Match
 
+* Matching algorithms::
 * Invoking astmatch::           Inputs, outputs and options of Match
 
 Modeling and fitting
@@ -19951,10 +19952,65 @@ The nearest objects in the two catalogs, within the 
given aperture, will be foun
 The aperture can be a circle or an ellipse with any orientation.
 
 @menu
+* Matching algorithms::         Different ways to find the match
 * Invoking astmatch::           Inputs, outputs and options of Match
 @end menu
 
-@node Invoking astmatch,  , Match, Match
+@node Matching algorithms, Invoking astmatch, Match, Match
+@subsection Matching algorithms
+
+Matching involves two catalogs, let's call them catalog A (with N rows) and 
catalog B (with M rows).
+The most basic algorithm that immediately comes to mind is this:
+for each row in A (let's call it @mymath{A_i}), go over all the rows in B and 
calculate the distance to each and find the @mymath{B_j} that is nearest to 
@mymath{A_i}.
+If the distance between @mymath{A_i} and @mymath{B_j} is below a certain 
acceptable threshold (or radius), consider them as a match.
+
+You may think of this (initially genius!) solution: During the parsing, when I 
find that @mymath{A_i} and @mymath{B_j} satisfy the match above, I will remove 
@mymath{B_j} from the search of matches for @mymath{A_k} (where @mymath{k> i}).
+As a result, as I go down A and more matches are found, I have to calculate 
less distances (there are fewer elements in B that remain to be checked).
+However, this will introduce an important bias: @mymath{B_j} may actually be 
closer to @mymath{A_k}!
+But because @mymath{A_i} happened to be before @mymath{A_k} in your table, you 
removed @mymath{B_j} from the potential search domain of @mymath{A_k}.
+You will therefore loose that match, and replace it by a false match between 
@mymath{A_i} and @mymath{B_j}!
+
+In a single-dimentional match, this bias depends on the sorting of your two 
datasets (leading to different matches if you shuffle your datasets).
+But it will get more complex as you add dimensionality (for example catalogs 
dervied from 2D images or 3D cubes, where you have 2 and 3 different 
coordinates for each point).
+Here is how we adress this problem in Gnuastro's Match program: as we are 
doing the first parsing, we keep a log of all elements in A (@mymath{A_i} and 
@mymath{A_k} in the example above) that were within the radial threshold of 
@mymath{B_j}.
+Afterwards, for each @mymath{B_j} we can find the nearest element of A, 
irrespective of how the inputs were sorted, or their dimensionality.
+
+On the other hand, the basic parsing algorithm mentioned above is very 
computationally expensive:
+@mymath{N\times M} distances have to measured, and calculating the distance 
requires a square root and power of 2, which are not simple operations for the 
CPU.
+As a result, this basic algorithm will become terribly slow as your datasets 
grow in size.
+For example when N or M exceed hundreds of thousands (which is common in the 
current days with datasets like the European Space Agency's Gaia mission).
+Therefore that basic parsing algoritm won't work and we need to use more 
efficient ways to parse the two catalogs.
+Gnuastro's Match currently has two such algorithms:
+
+@table @asis
+@item Sort-based
+To avoid the problem of calculating @mymath{N\times M} Euclidean distances 
(that involve a square root), this method works like this:
+
+@enumerate
+@item
+Sort the two datasets by their first coordinate.
+Therefore @mymath{A_i<A_j} (only in first coordinate; when @mymath{i<j}), and 
similarly for the elements of B.
+@item
+Use the radial distance threshold to define a moving window of B over A.
+Therefore, within a single parsing of A, you can find all the elements in A 
that are sufficiently near every @mymath{B_j}.
+@item
+The nearest A within the subset that is nearest to @mymath{B_j} will be 
considered as the match.
+@end enumerate
+
+@item k-d tree based
+The k-d tree concept is much more abstract, but powerful (for example enabling 
parallel processing, that was not possible in the sort-based method above).
+In short a k-d tree is a partitioning of a k-dimentional space.
+For more on the k-d tree and Gnuastro's implementation of it, please see 
@ref{K-d tree}.
+
+To avoid getting into too much theory, let's look at it from a user's 
perspective: Match will internally construct a k-d tree for catalog A (the 
first catalog given to it).
+Optionally, you can also build the k-d tree of A with a separate call to Match 
(with @option{--kdtree=build} and @option{--output=mykdtree.fits}).
+This external k-d tree can be fed to Match later, when doing your matches (to 
avoid having to reconstruct it every time you want to match with the same first 
catalog; using @option{--kdtree=mykdtree.fits}).
+
+Each thread on your CPU will then be associated a list of rows from catalog B 
and the k-d tree is parsed independently for those to find the nearest element 
of A to that row in B.
+@end table
+
+
+@node Invoking astmatch,  , Matching algorithms, Match
 @subsection Invoking Match
 
 When given two catalogs, Match finds the rows that are nearest to each other 
within an input aperture.
@@ -29335,20 +29391,21 @@ have any type), see above for the definition of 
permutation.
 @cindex Matching
 @cindex Coordinate matching
 Matching is often necessary when two measurements of the same points have been 
done using different instruments (or hardware), different software or different 
configurations of the same software.
-In other words, you have two catalogs or tables and each has N columns 
containing the N-dimensional ``positional'' values of each point.
-Each can have other columns too, for example one can have brightness 
measurements in one filter, and another can have brightness measurements in 
another filter as well as morphology measurements or etc.
+In other words, you have two catalogs or tables, and each has N columns 
containing the N-dimensional ``coordinate'' values of each point.
+Each table can have other columns too, for example one can have brightness 
measurements in one filter, and another can have morphology measurements or etc.
 
-The matching functions here will use the positional columns to find the 
permutation necessary to apply to both tables.
-This will enable you to match by the positions, then apply the permutation to 
the brightness or morphology columns in the example above.
-The input and output data formats of the functions below are the some and 
described below before the actual functions.
+The matching functions here will use the coordinate columns of the two tables 
to find a permutation for each, and the total number of matched rows 
(@mymath{N_{match}}).
+This will enable you to match by the positions if you like.
+A a higher level, you can apply the permutation to the brightness or 
morphology columns to merge the catalogs over the @mymath{N_{match}} rows.
+The input and output data formats of the functions are the some and described 
below before the actual functions.
 Each function also has extra arguments due to the particular algorithm it uses 
for the matching.
 
 The two inputs of the functions (@code{coord1} and @code{coord2}) must be 
@ref{List of gal_data_t}.
-Each @code{gal_data_t} node in @code{coord1} or @code{coord2} should be a 
single dimensional dataset (column in a table) and all the nodes must have the 
same number of elements (rows).
+Each @code{gal_data_t} node in @code{coord1} or @code{coord2} should be a 
single dimensional dataset (column in a table) and all the nodes (in each) must 
have the same number of elements (rows).
 In other words, each column can be visualized as having the coordinates of 
each point in its respective dimension.
 The dimensions of the coordinates is determined by the number of 
@code{gal_data_t} nodes in the two input lists (which must be equal).
-The number of rows (or the number of elements in each @code{gal_data_t}) in 
the columns of @code{coord1} and @code{coord2} can be different.
-All these functions will all be satisfied if you use @code{gal_table_read} to 
read the two coordinate columns, see @ref{Table input output}.
+The number of rows (or the number of elements in each @code{gal_data_t}) in 
the columns of @code{coord1} and @code{coord2} can (and, usually will!) be 
different.
+In summary, these functions will be happy if you use @code{gal_table_read} to 
read the two coordinate columns from a file, see @ref{Table input output}.
 
 @cindex Permutation
 The functions below return a simply-linked list of three 1D datasets (see 
@ref{List of gal_data_t}), let's call the returned dataset @code{ret}.
@@ -29360,20 +29417,24 @@ If there wasn't any match, this function will return 
@code{NULL}.
 
 The two permutations can be applied to the rows of the two inputs: the first 
one (@code{ret}) should be applied to the rows of the table containing 
@code{coord1} and the second one (@code{ret->next}) to the table containing 
@code{coord2}.
 After applying the returned permutations to the inputs, the top 
@code{nummatched} elements of both will match with each other.
-The ordering of the rest of the elements is undefined (depends on the matching 
funciton used).
+The ordering of the rest of the elements is undefined (depends on the matching 
function used).
 The third node is the distances between the respective match (which may be 
elliptical distance, see discussion of ``aperture'' below).
 
 The functions will not simply return the nearest neighbor as a match.
-The nearest neighbor may be too far to be a meaningful.
-They will check the distance between the distance of the nearest neighbor of 
each point and only return a match for it it is within an acceptable 
N-dimensional distance (or ``aperture'').
+This is because the nearest neighbor may be too far to be a meaningful!
+They will check the distance between the nearest neighbor of each point and 
only return a match if it is within an acceptable N-dimensional distance (or 
``aperture'').
 The matching aperture is defined by the @code{aperture} array that is an input 
argument to the functions.
-If several points of one catalog lie within this aperture of a point in the 
other, the  nearest is defined as the match.
-In a 2D situation (where the input lists have two nodes), for the most generic 
case, it must have three elements: the major axis length, axis ratio and 
position angle (see @ref{Defining an ellipse and ellipsoid}).
+
+If several points of one catalog lie within this aperture of a point in the 
other catalog, the  nearest is defined as the match.
+In a 2D situation (where the input lists have two nodes), for the most generic 
case, @code{aperture} must have three elements: the major axis length, axis 
ratio and position angle (see @ref{Defining an ellipse and ellipsoid}).
 If @code{aperture[1]==1}, the aperture will be a circle of radius 
@code{aperture[0]} and the third value won't be used.
 When the aperture is an ellipse, distances between the points are also 
calculated in the respective elliptical distances (@mymath{r_{el}} in 
@ref{Defining an ellipse and ellipsoid}).
 
-
-
+@strong{Output permutations ignore internal sorting}: the output permutations 
will correspond to the initial inputs.
+Therefore, even when @code{inplace!=0} (and this function re-arranges the 
inputs in place), the output permutation will correspond to original (possibly 
non-sorted) inputs. The reason for this is that you rarely want to permute the 
actual positional columns after the match.
+Usually, you also have other columns (for example the brightness, morphology 
and etc) and you want to find how they differ between the objects that match.
+Once you have the permutations, they can be applied to those other columns 
(see @ref{Permutations}) and the higher-level processing can continue.
+So if you don't need the coordinate columns for the rest of your analysis, it 
is better to set @code{inplace=1}.
 
 @deftypefun {gal_data_t *} gal_match_sort_based (gal_data_t @code{*coord1}, 
gal_data_t @code{*coord2}, double @code{*aperture}, int @code{sorted_by_first}, 
int @code{inplace}, size_t @code{minmapsize}, int @code{quietmmap}, size_t 
@code{*nummatched})
 
@@ -29386,17 +29447,19 @@ When sorting is necessary and @code{inplace} is 
non-zero, the actual input colum
 Otherwise, an internal copy of the inputs will be made, used (sorted) and 
later freed before returning.
 Therefore, when @code{inplace==0}, inputs will remain untouched, but this 
function will take more time and memory.
 If internal allocation is necessary and the space is larger than 
@code{minmapsize}, the space will be not allocated in the RAM, but in a file, 
see description of @option{--minmapsize} and @code{--quietmmap} in 
@ref{Processing options}.
+@end deftypefun
 
-@cartouche
-@noindent
-@strong{Output permutations ignore internal sorting}: the output permutations 
will correspond to the initial inputs.
-Therefore, even when @code{inplace!=0} (and this function re-arranges the 
inputs in place), the output permutation will correspond to original (possibly 
non-sorted) inputs.
+@deftypefun {gal_data_t *} gal_match_kdtree (gal_data_t @code{*coord1}, 
gal_data_t @code{*coord2}, gal_data_t @code{*coord1_kdtree}, size_t 
@code{kdtree_root}, double @code{*aperture}, size_t @code{numthreads}, size_t 
@code{minmapsize}, int @code{quietmmap}, size_t @code{*nummatched})
+
+@cindex Matching by k-d tree
+@cindex k-d tree matching
+Use the k-d tree concept for finding matches between two catalogs, optionally 
in parallel (on @code{numthreads} threads).
+The k-d tree of the first input (@code{coord1_kdtree}), and its root index 
(@code{kdtree_root}), should be constructed and found before calling this 
function, to do this, you can use the @code{gal_kdtree_create} of @ref{K-d 
tree}.
+The desired @code{aperture} array is the same as @code{gal_match_sort_based} 
and discribed at the top of this section.
+
+The final number of matches is returned in @code{nummatched} and the format of 
the returned dataset (three columns) is described above.
+If internal allocation is necessary and the space is larger than 
@code{minmapsize}, the space will be not allocated in the RAM, but in a file, 
see description of @option{--minmapsize} and @code{--quietmmap} in 
@ref{Processing options}.
 
-The reason for this is that you rarely want to permute the actual positional 
columns after the match.
-Usually, you also have other columns (for example the brightness, morphology 
and etc) and you want to find how they differ between the objects that match.
-Once you have the permutations, they can be applied to those other columns 
(see @ref{Permutations}) and the higher-level processing can continue.
-So if you don't need the coordinate columns for the rest of your analysis, it 
is better to set @code{inplace=1}.
-@end cartouche
 @end deftypefun
 
 @node Statistical operations, Binary datasets, Matching, Gnuastro library
diff --git a/during-dev-test-data/kdtree-input.fits 
b/during-dev-test-data/kdtree-input.fits
deleted file mode 100644
index d3780eb..0000000
Binary files a/during-dev-test-data/kdtree-input.fits and /dev/null differ
diff --git a/during-dev-test-data/match-query.txt 
b/during-dev-test-data/match-query.txt
deleted file mode 100644
index 394bf51..0000000
--- a/during-dev-test-data/match-query.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-14.55495528995644      13.952740903340256      14.512879995960738      
-14.156180015003525     12.80845342418684       14.826212427784835      
-13.505267742341026     13.013443271481133      11.183241471767612      
-13.09715467421369      10.936844869289065      10.692733635319755      
-11.855001056969742     14.100516383294387      11.839440238220615      
-11.766844757741728     10.752701002494069      12.958678702895487      
-12.455862588902148     10.943892162638429      11.946732314414476      
-13.386374627834787     11.49741032889326       13.000603833689892      
-14.49239538261452      11.133665168041762      12.959051224814937      
-10.968887621940603     13.55609990177711       14.863758607257214      
diff --git a/during-dev-test-data/scripts/gen-inputs.sh 
b/during-dev-test-data/scripts/gen-inputs.sh
new file mode 100755
index 0000000..21cd7e1
--- /dev/null
+++ b/during-dev-test-data/scripts/gen-inputs.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# If you want to use a real catalog as reference (and manually add noise to
+# randomly selected rows, give a value to the 'real' variable). If this is
+# empty, a mock table with two columns of integers as input will be
+# created.
+real=~/tmp/gaia.fits
+real_c1=ra1
+real_c2=dec1
+
+# Number of points (number of inputs in mock, and number of rows in noised
+# catalog for real).
+num=300000
+
+# Scatter around each coordinate to add and aperture size for mock and real
+# tables.
+sigma_mock=1
+sigma_real=0.000555556 # 2 arcsecond (in degrees)
+
+# Build the base catalog input
+if [ x"$real" = x ]; then
+    sigma=$sigma_mock
+    for i in $(seq 1 $num); do
+        echo "$i $i"
+    done | asttable ../base.fits
+else
+    sigma=$sigma_real
+    asttable $real -c$real_c1,$real_c2 --output=../base.fits
+fi
+
+# Add noise to both columns to build the new dataset.
+asttable ../base.fits -c'arith $1 '$sigma' mknoise-sigma' \
+         -c'arith $2 '$sigma' mknoise-sigma' --rowrandom=$num \
+         -o../base-noised.fits



reply via email to

[Prev in Thread] Current Thread [Next in Thread]