[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[gnuastro-commits] master 1e98f7a: Table: two new options to limit rows:
From: |
Mohammad Akhlaghi |
Subject: |
[gnuastro-commits] master 1e98f7a: Table: two new options to limit rows: --rowlimit and --rowrandom |
Date: |
Fri, 5 Feb 2021 20:08:32 -0500 (EST) |
branch: master
commit 1e98f7a61eb61cf4b0fe38016b4f6712dc13af6b
Author: Mohammad Akhlaghi <mohammad@akhlaghi.org>
Commit: Mohammad Akhlaghi <mohammad@akhlaghi.org>
Table: two new options to limit rows: --rowlimit and --rowrandom
Until now, the only position-based row-selection options available to a
user were the '--head' and '--tail' options. But in some scenarios, we
want a certain range of rows based on their position in the full table (for
example rows 100 to 200). In other cases, to get a feeling of the dataset,
or for statistical tests, we want a random set of the rows. Until now, it
was not easy to extract these.
With this commit, the new '--rowlimit' option enables selecting a
contiguous range of rows anywhere within the table and the '--rowrandom'
enables selecting a random sub-set of the rows (after all value-based
selections have been applied).
---
NEWS | 8 +++
bin/table/args.h | 40 ++++++++++++
bin/table/main.h | 7 +++
bin/table/table.c | 180 ++++++++++++++++++++++++++++++++++++++++++++----------
bin/table/ui.c | 79 ++++++++++++++++++++++--
bin/table/ui.h | 3 +
doc/gnuastro.texi | 34 +++++++++++
lib/checkset.c | 2 +-
8 files changed, 314 insertions(+), 39 deletions(-)
diff --git a/NEWS b/NEWS
index e194a72..980d9eb 100644
--- a/NEWS
+++ b/NEWS
@@ -13,6 +13,14 @@ See the end of the file for license conditions.
columns of the final output table. This is handy when you want a
"clean" (no NaN values in any column) table, but the table has many
columns.
+ --rowlimit: new option to specify the positional interval of rows to
+ show. Until now the '--head' or '--tail' options would just allow
+ seeing the first or last few rows. You can use this to view a
+ contiguous set of rows in the middle of the table.
+ --rowrandom: Make a random selection of the rows. This option is useful
+ when you have a large dataset and just want to see a random sub-set of
+ the rows. It takes an integer, selects that many rows from the input
+ randomly.
** Removed features
diff --git a/bin/table/args.h b/bin/table/args.h
index b66842e..bf5b13a 100644
--- a/bin/table/args.h
+++ b/bin/table/args.h
@@ -316,6 +316,46 @@ struct argp_option program_options[] =
GAL_OPTIONS_NOT_SET
},
{
+ "rowlimit",
+ UI_KEY_ROWLIMIT,
+ "INT,INT",
+ 0,
+ "Only rows in this row-counter range.",
+ UI_GROUP_OUTROWS,
+ &p->rowlimit,
+ GAL_TYPE_STRING,
+ GAL_OPTIONS_RANGE_GE_0,
+ GAL_OPTIONS_NOT_MANDATORY,
+ GAL_OPTIONS_NOT_SET,
+ gal_options_parse_csv_float64
+ },
+ {
+ /* The long name must be the documented '--rowrandom' (same word order
+ as UI_KEY_ROWRANDOM and 'p->rowrandom'). */
+ "rowrandom",
+ UI_KEY_ROWRANDOM,
+ "INT",
+ 0,
+ "Number of rows to select randomly.",
+ UI_GROUP_OUTROWS,
+ &p->rowrandom,
+ GAL_TYPE_SIZE_T,
+ GAL_OPTIONS_RANGE_GE_0,
+ GAL_OPTIONS_NOT_MANDATORY,
+ GAL_OPTIONS_NOT_SET,
+ },
+ {
+ /* On/off flag (no argument): take the random seed from the
+ environment. */
+ "envseed",
+ UI_KEY_ENVSEED,
+ 0,
+ 0,
+ "Use GSL_RNG_SEED env. for '--rowrandom'.",
+ UI_GROUP_OUTROWS,
+ &p->envseed,
+ GAL_OPTIONS_NO_ARG_TYPE,
+ GAL_OPTIONS_RANGE_0_OR_1,
+ GAL_OPTIONS_NOT_MANDATORY,
+ GAL_OPTIONS_NOT_SET
+ },
+ {
"noblank",
UI_KEY_NOBLANK,
"STR[,STR]",
diff --git a/bin/table/main.h b/bin/table/main.h
index c924d7b..dee83e2 100644
--- a/bin/table/main.h
+++ b/bin/table/main.h
@@ -24,6 +24,7 @@ along with Gnuastro. If not, see
<http://www.gnu.org/licenses/>.
#define MAIN_H
/* Include necessary headers */
+#include <gsl/gsl_rng.h>
#include <gnuastro/data.h>
#include <gnuastro-internal/options.h>
@@ -100,6 +101,9 @@ struct tableparams
uint8_t descending; /* Sort columns in descending order. */
size_t head; /* Output only the no. of top rows. */
size_t tail; /* Output only the no. of bottom rows. */
+ gal_data_t *rowlimit; /* Output rows in row-counter range. */
+ size_t rowrandom; /* Number of rows to show randomly. */
+ uint8_t envseed; /* Use the environment for random seed. */
gal_data_t *noblank; /* Remove rows that have blank. */
gal_list_str_t *catcolumnfile; /* Filename to concat column wise. */
gal_list_str_t *catcolumnhdu; /* HDU/extension for the catcolumn. */
@@ -123,6 +127,9 @@ struct tableparams
time_t rawtime; /* Starting time of the program. */
gal_data_t **colarray; /* Array of columns, with arithmetic. */
size_t numcolarray; /* Number of elements in 'colarray'. */
+ gsl_rng *rng; /* Main random number generator. */
+ const char *rng_name; /* Name of random number generator. */
+ unsigned long int rng_seed; /* Random number generator seed. */
/* For arithmetic operators. */
gal_list_str_t *wcstoimg_p; /* Pointer to the node. */
diff --git a/bin/table/table.c b/bin/table/table.c
index 60bbc2c..8cf4668 100644
--- a/bin/table/table.c
+++ b/bin/table/table.c
@@ -29,6 +29,7 @@ along with Gnuastro. If not, see
<http://www.gnu.org/licenses/>.
#include <stdlib.h>
#include <unistd.h>
+#include <gsl/gsl_rng.h>
#include <gsl/gsl_heapsort.h>
#include <gnuastro/txt.h>
@@ -338,7 +339,7 @@ table_selection_equal_or_notequal(struct tableparams *p,
gal_data_t *col,
static void
-table_selection(struct tableparams *p)
+table_select_by_value(struct tableparams *p)
{
uint8_t *u;
struct list_select *tmp;
@@ -559,47 +560,159 @@ table_sort(struct tableparams *p)
+/* Apply random row selection. If the returned value is 'EXIT_SUCCESS',
+ then, it was successful. Otherwise, it will return 'EXIT_FAILURE' and
+ the input won't be touched.
+
+ 'table' gives the number of rows (through its 'size' and 'dsize') and
+ is what the final permutation is applied to, 'rng' is the GSL random
+ number generator to draw from and 'numrandom' is the number of rows
+ to keep. 'minmapsize' and 'quietmmap' are only passed on to
+ 'gal_data_alloc' for the two temporary arrays. The only failure
+ condition checked here is 'numrandom' being larger than the number of
+ rows. */
+static int
+table_random_rows(gal_data_t *table, gsl_rng *rng, size_t numrandom,
+ size_t minmapsize, int quietmmap)
+{
+ int bad;
+ uint8_t *marr, *u, *uf;
+ gal_data_t *mask, *perm;
+ size_t i, b, g, *s, *sf, ind;
+
+ /* Sanity check: more rows were requested than exist, return before
+ anything is allocated or changed. */
+ if(numrandom>table->size)
+ return EXIT_FAILURE;
+
+ /* Allocate space for the permutation array and the mask
+ array. Initialize the mask array to 1 (so we later set the rows we
+ want to 0). Note that the 'do-while' loop writes one element before
+ its first check, so it needs at least one row. */
+ mask=gal_data_alloc(NULL, GAL_TYPE_UINT8, 1, table->dsize, NULL, 0,
+ minmapsize, quietmmap, NULL, NULL, NULL);
+ perm=gal_data_alloc(NULL, GAL_TYPE_SIZE_T, 1, table->dsize, NULL, 0,
+ minmapsize, quietmmap, NULL, NULL, NULL);
+ uf=(u=mask->array)+mask->size; do *u++ = 1; while(u<uf);
+
+ /* Select the row numbers: each pass of the outer loop marks exactly
+ one new row as wanted (mask value of 0). */
+ marr=mask->array;
+ for(i=0;i<numrandom;++i)
+ {
+ /* Select a random index and make sure it's new: keep drawing
+ until the drawn row is still unselected (mask value of 1). This
+ assumes 'gsl_rng_uniform' returns a value in [0,1), so 'ind' is
+ always smaller than 'table->size' (to be confirmed in the GSL
+ manual). */
+ bad=1;
+ while(bad)
+ {
+ ind = gsl_rng_uniform(rng) * table->size;
+ if(marr[ind]) bad=0;
+ }
+ marr[ind]=0;
+ }
+
+ /* Fill up the permutation array: going over the rows in their
+ original order, wanted rows (mask of 0) get the values 0, 1, 2, ...
+ while the unwanted rows (mask of 1) get the values 'numrandom',
+ 'numrandom'+1, ... */
+ g=0; /* Good indices (starting from 0). */
+ b=numrandom; /* Bad indices (starting from total number of good). */
+ u=mask->array;
+ sf=(s=perm->array)+perm->size;
+ do *s = *u++ ? b++ : g++; while(++s<sf);
+
+ /* Apply the final permutation to the whole table. The meaning of the
+ last two arguments is defined by 'table_apply_permutation' (not
+ visible here); presumably the number of rows to keep and a flag. */
+ table_apply_permutation(table, perm->array, numrandom, 1);
+
+ /* Clean up and return. */
+ gal_data_free(mask);
+ gal_data_free(perm);
+ return EXIT_SUCCESS;
+}
+
+
+
+
+
+/* Keep only the rows selected by their position in the table, through
+ one of '--rowrandom', '--rowlimit', '--head' or '--tail'.
+
+ With '--rowrandom' the job is given to 'table_random_rows' and this
+ function returns right after it (printing a warning, unless '--quiet'
+ was given, when there were not enough rows). For the other options,
+ every column of 'p->table' is trimmed in place: the strings of
+ discarded rows are freed, the kept elements are moved to the start of
+ the array and the column's 'size' and 'dsize[0]' are updated. */
static void
-table_head_tail(struct tableparams *p)
+table_select_by_position(struct tableparams *p)
{
char **strarr;
gal_data_t *col;
size_t i, start, end;
+ double *darr = p->rowlimit ? p->rowlimit->array : NULL;
+
+ /* Random row selection (by position, not value). This step is
+ independent of the other operations of this function, so as soon as
+ its finished return. */
+ if(p->rowrandom)
+ {
+ if( table_random_rows(p->table, p->rng, p->rowrandom,
+ p->cp.minmapsize, p->cp.quietmmap)
+ == EXIT_FAILURE && p->cp.quiet==0 )
+ error(EXIT_SUCCESS, 0, "'--rowrandom' not activated because "
+ "the number of rows in the table at this stage (%zu) "
+ "is smaller than the number of requested random rows "
+ "(%zu). You can suppress this message with '--quiet'",
+ p->table->size, p->rowrandom);
+ return;
+ }
+
+ /* Sanity check: at this stage the '--rowlimit' values count from 0,
+ so they are reported with 1 added (as the user gave them). */
+ if(p->rowlimit)
+ {
+ if(darr[0]>=p->table->size)
+ error(EXIT_FAILURE, 0, "the first value to '--rowlimit' (%g) "
+ "is larger than the number of rows (%zu)",
+ darr[0]+1, p->table->size);
+ else if( darr[1]>=p->table->size )
+ error(EXIT_FAILURE, 0, "the second value to '--rowlimit' (%g) "
+ "is larger than the number of rows (%zu)",
+ darr[1]+1, p->table->size);
+ }
/* Go over all the columns and make the necessary corrections. */
for(col=p->table;col!=NULL;col=col->next)
{
- /* If we are dealing with strings, we'll need to free the strings
- that the columns that will not be used point to (outside the
- allocated array directly 'gal_data_t'). We don't have to worry
- about the space for the actual pointers (they will be free'd by
- 'free' in any case, since they are in the initially allocated
- array).*/
+ /* FOR STRING: we'll need to free the individual strings that will
+ not be used (outside the allocated array directly
+ 'gal_data_t'). We don't have to worry about the space for the
+ actual pointers (they will be free'd by 'free' in any case, since
+ they are in the initially allocated array).
+
+ The number of rows is read from 'col->size' (not
+ 'p->table->size'): 'p->table' is the first column and its size is
+ already reduced when the later columns are processed. */
if(col->type==GAL_TYPE_STRING)
{
- /* Set the start and ending indexs. */
- start = p->head!=GAL_BLANK_SIZE_T ? p->head : 0;
- end = p->head!=GAL_BLANK_SIZE_T ? p->table->size : p->tail;
-
- /* Free their allocated spaces. */
+ /* Parse the rows and free extra pointers. */
strarr=col->array;
- for(i=start; i<end; ++i) { free(strarr[i]); strarr[i]=NULL; }
+ if(p->rowlimit)
+ {
+ /* Note that the given values to '--rowlimit' started from 1,
+ but in 'ui.c' we subtracted one from it (so at this stage,
+ it starts from 0). */
+ start = darr[0];
+ end = darr[1];
+ for(i=0;i<col->size;++i)
+ if(i<start || i>end) { free(strarr[i]); strarr[i]=NULL; }
+ }
+ else
+ {
+ /* Free their allocated spaces: with '--head' the discarded
+ rows are the ones after the first 'p->head'; with '--tail'
+ they are the ones before the last 'p->tail'. */
+ start = p->head!=GAL_BLANK_SIZE_T ? p->head : 0;
+ end = ( p->head!=GAL_BLANK_SIZE_T
+ ? col->size
+ : col->size - p->tail );
+ for(i=start; i<end; ++i) { free(strarr[i]); strarr[i]=NULL; }
+ }
}
- /* For '--tail', we'll need to bring the last columns to the
- start. Note that we are using 'memmove' because we want to be safe
- with overlap. */
- if(p->tail!=GAL_BLANK_SIZE_T)
- memmove(col->array,
- gal_pointer_increment(col->array, col->size - p->tail,
- col->type),
- p->tail*gal_type_sizeof(col->type));
-
- /* In any case (head or tail), the new number of column elements is
- the given value. */
- col->size = col->dsize[0] = ( p->head!=GAL_BLANK_SIZE_T
- ? p->head
- : p->tail );
+ /* Make the final adjustment. */
+ if(p->rowlimit)
+ {
+ /* Move the values up to the top and correct the size (both
+ 'size' and 'dsize[0]', like the '--head' or '--tail' case
+ below). */
+ col->size=col->dsize[0]=darr[1]-darr[0]+1;
+ memmove(col->array,
+ gal_pointer_increment(col->array, darr[0], col->type),
+ (darr[1]-darr[0]+1)*gal_type_sizeof(col->type));
+ }
+ else
+ {
+ /* For '--tail', we'll need to bring the last columns to the
+ start. Note that we are using 'memmove' because we want to be
+ safe with overlap. */
+ if(p->tail!=GAL_BLANK_SIZE_T)
+ memmove(col->array,
+ gal_pointer_increment(col->array, col->size - p->tail,
+ col->type),
+ p->tail*gal_type_sizeof(col->type));
+
+ /* In any case (head or tail), the new number of column elements
+ is the given value. */
+ col->size = col->dsize[0] = ( p->head!=GAL_BLANK_SIZE_T
+ ? p->head
+ : p->tail );
+ }
}
}
@@ -853,15 +966,18 @@ table_noblank(struct tableparams *p)
void
table(struct tableparams *p)
{
- /* Apply a certain range (if required) to the output sample. */
- if(p->selection) table_selection(p);
+ /* Apply ranges based on row values (if required). */
+ if(p->selection) table_select_by_value(p);
/* Sort it (if required). */
if(p->sort) table_sort(p);
/* If the output number of rows is limited, apply them. */
- if(p->head!=GAL_BLANK_SIZE_T || p->tail!=GAL_BLANK_SIZE_T)
- table_head_tail(p);
+ if( p->rowlimit
+ || p->rowrandom
+ || p->head!=GAL_BLANK_SIZE_T
+ || p->tail!=GAL_BLANK_SIZE_T )
+ table_select_by_position(p);
/* If any operations are needed, do them. */
if(p->outcols)
diff --git a/bin/table/ui.c b/bin/table/ui.c
index 378f0db..bf173c3 100644
--- a/bin/table/ui.c
+++ b/bin/table/ui.c
@@ -271,11 +271,15 @@ ui_read_check_only_options(struct tableparams *p)
"v1x,v1y:v2x,v2y:v3x,v3y:...");
}
-
- /* Make sure '--head' and '--tail' aren't given together. */
- if(p->head!=GAL_BLANK_SIZE_T && p->tail!=GAL_BLANK_SIZE_T)
- error(EXIT_FAILURE, 0, "'--head' and '--tail' options cannot be "
- "called together");
+ /* Make sure only one of the positional row selection operations is
+ called in this run. Each comparison below is 0 or 1, so their sum is
+ the number of these options that were given: more than one is an
+ error. */
+ if( ( (p->rowlimit!=NULL) + (p->rowrandom!=0) +
+ (p->head!=GAL_BLANK_SIZE_T) + (p->tail!=GAL_BLANK_SIZE_T) ) > 1 )
+ error(EXIT_FAILURE, 0, "only one of the following options can be "
+ "called in one run: '--head', '--tail', '--rowlimit' and "
+ "'--rowrandom'");
/* If '--colmetadata' is given, make sure none of the given options have
more than three values. */
@@ -1042,7 +1046,8 @@ ui_check_select_sort_after(struct tableparams *p, size_t
nselect,
static void
ui_preparations(struct tableparams *p)
{
- size_t *colmatch;
+ double *darr;
+ size_t i, *colmatch;
gal_list_str_t *lines;
size_t nselect=0, origoutncols=0;
size_t sortindout=GAL_BLANK_SIZE_T;
@@ -1132,6 +1137,51 @@ ui_preparations(struct tableparams *p)
? p->table->size
: p->tail );
+ /* If rows are given, do some sanity checks on the two values and
+ convert them to count from 0. */
+ if(p->rowlimit)
+ {
+ /* There should only be two values. */
+ if(p->rowlimit->size!=2)
+ error(EXIT_FAILURE, 0, "only two values should be given to "
+ "'--rowlimit' (the top and bottom row numbers specifying "
+ "your desired range)");
+
+ /* Do individual checks. */
+ darr=p->rowlimit->array;
+ for(i=0;i<p->rowlimit->size;++i)
+ {
+ /* Make sure it isn't 0 or negative. */
+ if( darr[i]<=0 )
+ error(EXIT_FAILURE, 0, "%g (value given to '--rowlimit') "
+ "is smaller than, or equal to, zero! This option's "
+ "values are row-counters (starting from 1), so they "
+ "must be positive integers", darr[i]);
+
+ /* Make sure its an integer. */
+ if( darr[i] != (size_t)(darr[i]) )
+ error(EXIT_FAILURE, 0, "%g (value given to '--rowlimit') is "
+ "not an integer! This option's values are row-counters "
+ "so they must be integers.", darr[i]);
+
+ /* Subtract 1 from the value, so it counts from 0. */
+ --darr[i];
+ }
+
+ /* Make sure that the first value is smaller than the second. The
+ values were just converted to count from 0, so 1 is added back
+ when reporting them (to print what the user actually gave). */
+ if( darr[0] > darr[1] )
+ error(EXIT_FAILURE, 0, "the first value to '--rowlimit' (%g) is "
+ "larger than the second (%g). This option's values defines "
+ "a row-counter interval, assuming the first value is the top "
+ "of the desired interval (smaller row counter) and the second "
+ "value is the bottom of the desired interval (larger row "
+ "counter)", darr[0]+1, darr[1]+1);
+ }
+
+ /* If random rows are desired, we need to define a GSL random number
+ generator structure. */
+ if(p->rowrandom)
+ p->rng=gal_checkset_gsl_rng(p->envseed, &p->rng_name, &p->rng_seed);
/* Clean up. */
free(colmatch);
@@ -1211,6 +1261,19 @@ ui_read_check_inputs_setup(int argc, char *argv[],
struct tableparams *p)
/* Read/allocate all the necessary starting arrays. */
ui_preparations(p);
+
+ /* Let the user know basic information if necessary (for example when a
+ random number generator has been used). 'p->rng' is only allocated
+ when '--rowrandom' was given, so it is used as the flag here. */
+ if(p->rng && !p->cp.quiet)
+ {
+ /* Write the information. */
+ printf(PROGRAM_NAME" "PACKAGE_VERSION" started on %s",
+ ctime(&p->rawtime));
+ printf("Parameters used for '--rowrandom':\n");
+ printf(" - Random number generator name: %s\n", p->rng_name);
+ printf(" - Random number generator seed: %lu\n", p->rng_seed);
+ printf("(use '--quiet' to suppress this starting message)\n");
+ }
}
@@ -1245,4 +1308,8 @@ ui_free_report(struct tableparams *p)
gal_list_data_free(p->table);
gal_list_data_free(p->colmetadata);
if(p->colarray) free(p->colarray);
+
+ /* If a random number generator was allocated, free it. */
+ if(p->rng) gsl_rng_free(p->rng);
+
}
diff --git a/bin/table/ui.h b/bin/table/ui.h
index 5b29de3..a898b19 100644
--- a/bin/table/ui.h
+++ b/bin/table/ui.h
@@ -68,6 +68,9 @@ enum option_keys_enum
/* Only with long version (start with a value 1000, the rest will be set
automatically). */
UI_KEY_POLYGON = 1000,
+ UI_KEY_ENVSEED,
+ UI_KEY_ROWLIMIT,
+ UI_KEY_ROWRANDOM,
UI_KEY_INPOLYGON,
UI_KEY_OUTPOLYGON,
UI_KEY_CATCOLUMNRAWNAME,
diff --git a/doc/gnuastro.texi b/doc/gnuastro.texi
index 1680de4..757e742 100644
--- a/doc/gnuastro.texi
+++ b/doc/gnuastro.texi
@@ -10482,6 +10482,7 @@ When called with @option{--sort}, rows will be sorted
in descending order.
Only print the given number of rows from the @emph{top} of the final table.
Note that this option only affects the @emph{output} table.
For example if you use @option{--sort}, or @option{--range}, the printed rows
are the first @emph{after} applying the sort sorting, or selecting a range of
the full input.
+This option cannot be called with @option{--tail}, @option{--rowlimit} or
@option{--rowrandom}.
@cindex GNU Coreutils
If the given value to @option{--head} is 0, the output columns won't have any
rows and if its larger than the number of rows in the input table, all the rows
are printed (this option is effectively ignored).
@@ -10491,6 +10492,39 @@ This behavior is taken from the @command{head} program
in GNU Coreutils.
@itemx --tail=INT
Only print the given number of rows from the @emph{bottom} of the final table.
See @option{--head} for more.
+This option cannot be called with @option{--head}, @option{--rowlimit} or
@option{--rowrandom}.
+
+@item --rowlimit=INT,INT
+Only return the rows within the requested positional range (inclusive on both
sides).
+Therefore, @code{--rowlimit=5,7} will return 3 of the input rows: rows 5, 6 and
7.
+This option will abort if any of the given values is larger than the total
number of rows in the table.
+
+With the @option{--head} or @option{--tail} options you can only see the top
or bottom few rows.
+However, with this option, you can limit the returned rows to a contiguous set
of rows in the middle of the table.
+Therefore this option cannot be called with @option{--head}, @option{--tail},
or @option{--rowrandom}.
+
+@item --rowrandom=INT
+@cindex Random row selection
+@cindex Row selection, by random
+Select @code{INT} rows from the input table by random (assuming a uniform
distribution).
+This option is applied @emph{after} the value-based selection options (like
@option{--sort}, @option{--range}, @option{--polygon} and etc).
+On the other hand, only the row counters are randomly selected, this option
doesn't change the order.
+Therefore, if @option{--rowrandom} is called together with @option{--sort},
the returned rows are still sorted.
+This option cannot be called with @option{--head}, @option{--tail}, or
@option{--rowlimit}.
+
+This option will only have an effect if @code{INT} is smaller than, or equal to, the number
of rows when it is activated (after the value-based selection options have been
applied).
+When there are fewer rows, a warning is printed, saying that this option has
no effect.
+The warning can be disabled with the @option{--quiet} option.
+
+@cindex Reproducibility
+Due to its nature (to be random), the output of this option differs in each
run.
+Therefore 5 calls to Table with @option{--rowrandom} on the same input table
will generate 5 different outputs.
+If you want a reproducible random selection, set the @code{GSL_RNG_SEED}
environment variable and also use the @option{--envseed} option, for more see
@ref{Generating random numbers}.
+
+@item --envseed
+Read the random number generator seed from the @code{GSL_RNG_SEED} environment
variable for @option{--rowrandom} (instead of generating a different seed
internally on every run).
+This is useful if you want a reproducible random selection of the input rows.
+For more, see @ref{Generating random numbers}.
@item -b STR[,STR[,STR]]
@itemx --noblank=STR[,STR[,STR]]
diff --git a/lib/checkset.c b/lib/checkset.c
index 4247938..e4ddd04 100644
--- a/lib/checkset.c
+++ b/lib/checkset.c
@@ -49,7 +49,7 @@ along with Gnuastro. If not, see
<http://www.gnu.org/licenses/>.
/**************************************************************/
/* The GSL random number generator (RNG) reads values from the
environment. This function is designed to make the job easier for any
- program using GSL's RNG. If the user doesn't want to set the */
+ Gnuastro program using GSL's RNG functions. */
gsl_rng *
gal_checkset_gsl_rng(uint8_t envseed_bool, const char **name,
unsigned long int *seed)
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [gnuastro-commits] master 1e98f7a: Table: two new options to limit rows: --rowlimit and --rowrandom,
Mohammad Akhlaghi <=