gnunet-svn
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[GNUnet-SVN] r2436 - in Extractor/src/plugins: . pdf


From: grothoff
Subject: [GNUnet-SVN] r2436 - in Extractor/src/plugins: . pdf
Date: Thu, 5 Jan 2006 16:26:45 -0800 (PST)

Author: grothoff
Date: 2006-01-05 16:26:43 -0800 (Thu, 05 Jan 2006)
New Revision: 2436

Modified:
   Extractor/src/plugins/pdf/Stream.cc
   Extractor/src/plugins/tarextractor.c
Log:
Hello Christian,

As promised, I attach a rewritten 'tarextractor.c' for
libextractor-0.5.9 (it also works with previous versions).
Given the extent of the changes, I believe it is just as good to send
the full text rather than a diff.

Keywords 'date' and 'format' are now extracted. More checksum variants
were added. Long filenames as produced by GNU and Schilling tar
(and possibly Solaris pax) are extracted.

I now use "application/x-tar" uniformly as the mimetype for all variants
of tar. IMHO this makes more sense than mapping both the very old V7 and
the very recent POSIX formats to it while the intermediate versions are
mapped to more exotic and confusing values such as
"application/x-ustar" (for POSIX 1988), "application/x-gtar" (for GNU
tar archives, which exist in different variants) or (maybe)
"application/x-xstar" (for Schilling's tar). The new 'format' keyword
is there to help tell one variant from another. Recent
GNU and Schilling tar can unpack nearly every known variant anyway.

In some cases filenames are extracted as UTF-8 strings (above the ASCII
range), in other cases as raw 8-bit octet strings (ISO-8859-15 by
default here). Somehow libextractor correctly takes care of both cases
(maybe by using UTF-8 prefix octets? I have yet to read the code).
I haven't tried to outsmart it.

Cordially,
Ronan


Modified: Extractor/src/plugins/pdf/Stream.cc
===================================================================
--- Extractor/src/plugins/pdf/Stream.cc 2006-01-06 00:26:04 UTC (rev 2435)
+++ Extractor/src/plugins/pdf/Stream.cc 2006-01-06 00:26:43 UTC (rev 2436)
@@ -420,7 +420,16 @@
   predLine = NULL;
   ok = gFalse;
 
+  if (width <= 0 || nComps <= 0 || nBits <= 0 ||
+      nComps >= INT_MAX/nBits ||
+      width >= INT_MAX/nComps/nBits) {
+    return;
+  }
   nVals = width * nComps;
+  if (nVals + 7 <= 0) {
+    return;
+  }
+
   totalBits = nVals * nBits;
   if (totalBits == 0 ||
        (totalBits / nBits) / nComps != width ||
@@ -1276,16 +1285,15 @@
 CCITTFaxStream::CCITTFaxStream(Stream *strA, int encodingA, GBool endOfLineA,
                               GBool byteAlignA, int columnsA, int rowsA,
                               GBool endOfBlockA, GBool blackA):
-    FilterStream(strA) {
+  FilterStream(strA) {
   encoding = encodingA;
   endOfLine = endOfLineA;
   byteAlign = byteAlignA;
   columns = columnsA;
+
   rows = rowsA;
   endOfBlock = endOfBlockA;
   black = blackA;
-  refLine = (short *)gmalloc((columns + 3) * sizeof(short));
-  codingLine = (short *)gmalloc((columns + 2) * sizeof(short));
 
   eof = gFalse;
   row = 0;
@@ -1294,8 +1302,14 @@
   codingLine[0] = 0;
   codingLine[1] = refLine[2] = columns;
   a0 = 1;
-
   buf = EOF;
+  if (columns + 4 < 1 || (columns + 4) >= INT_MAX / sizeof(short)) {
+    /* illegal value, bail out */
+    eof = gTrue;
+    columns = 0;
+  }
+  refLine = (short *)gmalloc((columns + 3) * sizeof(short));
+  codingLine = (short *)gmalloc((columns + 2) * sizeof(short));  
 }
 
 CCITTFaxStream::~CCITTFaxStream() {
@@ -2974,6 +2988,7 @@
   length = read16() - 2;
   scanInfo.numComps = str->getChar();
   if (scanInfo.numComps <= 0 || scanInfo.numComps > 4) {
+    scanInfo.numComps = 0;
     error(getPos(), "Bad number of components in DCT stream");
     return gFalse;
   }
@@ -3051,12 +3066,12 @@
   while (length > 0) {
     index = str->getChar();
     --length;
-    if ((index & 0x0f) >= 4) {
+    if (((index & 0x0f) >= 4) || ((index & ~0x10) < 0)) {
       error(getPos(), "Bad DCT Huffman table");
       return gFalse;
     }
     if (index & 0x10) {
-      index &= 0x0f;
+      index &= 0x03;
       if (index >= numACHuffTables)
        numACHuffTables = index+1;
       tbl = &acHuffTables[index];

Modified: Extractor/src/plugins/tarextractor.c
===================================================================
--- Extractor/src/plugins/tarextractor.c        2006-01-06 00:26:04 UTC (rev 2435)
+++ Extractor/src/plugins/tarextractor.c        2006-01-06 00:26:43 UTC (rev 2436)
@@ -23,14 +23,16 @@
 
 /*
  * Note that this code is not complete!
- * It will not report correct results for very long member filenames
- * (> 99 octets) when the archive was made with GNU tar or Solaris tar.
  *
  * References:
+ *
  * http://www.mkssoftware.com/docs/man4/tar.4.asp
  * (does document USTAR format common nowadays,
  *  but not other extended formats such as the one produced
  *  by GNU tar 1.13 when very long filenames are met.)
+ *
+ * http://gd.tuwien.ac.at/utils/archivers/star/README.otherbugs
+ * (J. Schilling's remarks on TAR formats compatibility issues.)
  */
 
 static EXTRACTOR_KeywordList * addKeyword(EXTRACTOR_KeywordType type,
@@ -39,7 +41,7 @@
   EXTRACTOR_KeywordList * result = next;
 
   if (NULL != keyword) {
-    if (0 == strlen(keyword)) {
+    if (0 == *keyword) {
       free(keyword);
     } else {
       result = malloc(sizeof(EXTRACTOR_KeywordList));
@@ -59,81 +61,129 @@
 static EXTRACTOR_KeywordList * appendKeyword(EXTRACTOR_KeywordType type,
                                             char * keyword,
                                             EXTRACTOR_KeywordList * last) {
-  EXTRACTOR_KeywordList * result;
+  EXTRACTOR_KeywordList * result = last;
 
-  if ( (last != NULL) &&
-       (last->next != NULL) )
-    abort();
-  if (keyword == NULL)
-    return last;
-  if (strlen(keyword) == 0) {
-    free(keyword);
-    return last;
+  if (NULL != keyword) {
+    if (0 == *keyword) {
+      free(keyword);
+    } else {
+      if ( (NULL != last) && (NULL != last->next) )
+        abort();
+      result = malloc(sizeof(EXTRACTOR_KeywordList));
+      if(NULL == result) {
+        free(keyword);
+      } else {
+        result->next = NULL;
+        result->keywordType = type;
+        result->keyword = keyword;
+        if (NULL != last)
+          last->next = result;
+      }
+    }
   }
-  result = malloc(sizeof(EXTRACTOR_KeywordList));
-  result->next = NULL;
-  result->keywordType = type;
-  result->keyword = keyword;
-  if (last != NULL)
-    last->next = result;
+
   return result;
 }
 
+/*
+ * Define known TAR archive member variants.
+ * In theory different variants
+ * can coexist within a single TAR archive file
+ * although this will be uncommon.
+ */
+#define TAR_V7ORIGINAL_FORMAT    (1)
+#define TAR_V7EXTENDED_FORMAT    (1 << 1)
+#define TAR_SCHILLING1985_FORMAT (1 << 2)
+#define TAR_POSIX1988_FORMAT     (1 << 3)
+#define TAR_GNU1991_FORMAT       (1 << 4)
+#define TAR_SCHILLING1994_FORMAT (1 << 5)
+#define TAR_GNU1997_FORMAT       (1 << 6)
+#define TAR_POSIX2001_FORMAT     (1 << 7)
+#define TAR_SCHILLING2001_FORMAT (1 << 8)
+#define TAR_SOLARIS2001_FORMAT   (1 << 9)
+#define TAR_GNU2004_FORMAT       (1 << 10)
+
+/*
+ * TAR header structure, modelled after POSIX.1-1988
+ */
 typedef struct {
-  char name[100];
+  char fileName[100];
   char mode[8];
   char userId[8];
   char groupId[8];
-  char filesize[12];
+  char fileSize[12];
   char lastModTime [12];
   char chksum[8];
   char link;
   char linkName[100];
-} TarHeader;
-
-typedef struct {
-  TarHeader tar;
-  char magic[6];
+  /*
+   * All fields below are
+   * either zero-filled or undefined
+   * for UNIX V7 TAR archive members ;
+   * their header is always 512 octets long nevertheless.
+   */
+  char ustarMagic[6];
   char version[2];
-  char uname[32];
-  char gname[32];
-  char devmajor[8];
-  char devminor [8];
+  char userName[32];
+  char groupName[32];
+  char devMajor[8];
+  char devMinor[8];
   char prefix[155];
-} USTarHeader;
+  char filler[12];
+} TarHeader;
 
-static unsigned
-taroctalvalue(const char *data,
-              size_t size,
-              unsigned long long *valueptr)
+#define TAR_HEADER_SIZE (sizeof(TarHeader))
+#define TAR_TIME_FENCE  ((long long) (-(1LL << 62)))
+
+static size_t
+tar_roundup(size_t size)
 {
-   unsigned result = 0;
+  size_t diff = (size % TAR_HEADER_SIZE);
 
-   if(NULL != data && 0 < size)
-   {
+  return (0 == diff) ? size : (size + (TAR_HEADER_SIZE - diff));
+}
+
+static int
+tar_isnonzero(const char *data,
+              unsigned int length)
+{
+  unsigned int total = 0;
+
+  while(total < length) {
+    if(0 != data[total])
+      return 1;
+    total++;
+  }
+  
+  return 0;
+}
+
+static unsigned int
+tar_octalvalue(const char *data,
+               size_t size,
+               unsigned long long *valueptr)
+{
+   unsigned int result = 0;
+
+   if(NULL != data && 0 < size) {
      const char *p = data;
      int found = 0;
      unsigned long long value = 0;
 
-     while( (p < data + size) && (' ' == *p))
+     while( (p < data + size) && (' ' == *p) )
        p += 1;
 
-     while( (p < data + size) && ('0' <= *p) && (*p < '8') )
-     {
+     while( (p < data + size) && ('0' <= *p) && (*p < '8') ) {
        found = 1;
        value *= 8;
        value += (*p - '0');
        p += 1;
      }
 
-     if(0 != found)
-     {
-       while( (p < data + size) && (' ' == *p) )
+     if(0 != found) {
+       while( (p < data + size) && ((0 == *p) || (' ' == *p)) )
          p += 1;
 
-       while( (p < data + size) && (0 == *p) )
-         p += 1;
-
        result = (p - data);
      }
 
@@ -144,137 +194,619 @@
    return result;
 }
 
+static int
+tar_time(long long timeval,
+         char *rtime,
+         unsigned int rsize)
+{
+  int retval = 0;
 
+  /*
+   * shift epoch to proleptic times
+   * to make subsequent modulo operations safer.
+   */
+  long long my_timeval = timeval
+                       + ((long long) ((1970 * 365) + 478)
+                        * (long long) 86400);
+
+  unsigned int seconds = (unsigned int) (my_timeval % 60);
+  unsigned int minutes = (unsigned int) ((my_timeval / 60) % 60);
+  unsigned int hours   = (unsigned int) ((my_timeval / 3600) % 24);
+
+  unsigned int year    = 0;
+  unsigned int month   = 1;
+
+  unsigned int days    = (unsigned int) (my_timeval / (24 * 3600));
+
+  unsigned int days_in_month[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 
31};
+  unsigned int diff = 0;
+
+  if ((long long)0 > my_timeval)
+    return EDOM;
+
+  /*
+   * 400-year periods
+   */
+  year += ( 400 * (days / ((365 * 400) + 97)) );
+  days %= ((365 * 400) + 97);
+
+  /*
+   * 100-year periods
+   */
+  diff  = (days / ((365 * 100) + 24));
+  if(4 <= diff) {
+     year += 399;
+     days = 364;
+  }
+  else {
+     year += (100 * diff);
+     days %= ((365 * 100) + 24);
+  }
+
+  /*
+   * remaining leap years
+   */
+  year += (4 * (days / ((365 * 4) + 1)) );
+  days %= ((365 * 4) + 1);
+
+  while(1) {
+    if( (0 == (year % 400)) || ((0 == (year % 4)) && (0 != (year % 100))) ) {
+      if(366 > days) {
+        break;
+      } else {
+        days -= 366;
+        year++;
+      }
+    }
+    else {
+      if(365 > days) {
+        break;
+      } else {
+        days -= 365;
+        year++;
+      }
+    }
+  }
+
+  if( (0 == (year % 400)) || ((0 == (year % 4)) && (0 != (year % 100))) )
+    days_in_month[1] = 29;
+
+  for (month = 0; (month < 12) && (days >= days_in_month[month]); month += 1)
+    days -= days_in_month[month];
+
+  retval = snprintf(rtime, rsize, "%04u-%02u-%02uT%02u:%02u:%02uZ",
+                    year, month + 1, days + 1, hours, minutes, seconds);
+
+  return (retval < rsize) ? 0 : EOVERFLOW;
+}
+
 struct EXTRACTOR_Keywords *
 libextractor_tar_extract(const char * filename,
                         const char * data,
                         size_t size,
                         struct EXTRACTOR_Keywords * prev) {
-  const TarHeader * tar;
-  const USTarHeader * ustar;
-  size_t pos;
+  char *fname = NULL;
+  size_t pos = 0;
   int contents_are_empty = 1;
-  const char * mimetype = NULL;
+  long long maxftime = TAR_TIME_FENCE;
+  unsigned int format_archive = 0;
   struct EXTRACTOR_Keywords * last;
 
+  if (512 != TAR_HEADER_SIZE)
+    return prev; /* compiler should remove this when optimising */
+  if (0 != (size % TAR_HEADER_SIZE))
+    return prev; /* cannot be tar! */
+  if (size < TAR_HEADER_SIZE)
+    return prev; /* too short, or somehow truncated */
+
   last = prev;
   if (last != NULL)
     while (last->next != NULL)
       last = last->next;
 
-  if (0 != (size % 512) )
-    return prev; /* cannot be tar! */
-  if (size < 1024)
-    return prev; /* too short, or somehow truncated */
-
   pos = 0;
-  while (pos + sizeof(TarHeader) < size) {
+  while ((pos + TAR_HEADER_SIZE) <= size) {
+    const TarHeader * tar = NULL;
+    unsigned format_member = 0;
+    unsigned long long fmode;
     unsigned long long fsize;
+    long long ftime;
+    char typeFlag = -1;
     const char * nul_pos;
-    const char * ustar_prefix = NULL;
-    unsigned int ustar_prefix_length = 0;
-    unsigned int tar_name_length;
-    unsigned int zeropos;
-    int header_is_empty = 1;
+    unsigned int tar_prefix_length = 0;
+    unsigned int tar_name_length = 0;
+    unsigned int checksum_offset;
+    int checksum_computed_500s = 0;
+    int checksum_computed_512s = 0;
+    unsigned int checksum_computed_500u = 0;
+    unsigned int checksum_computed_512u = 0;
+    unsigned long long checksum_stored = 0;
 
-    if (pos + 1024 < size) {
-      const int * idata = (const int*) (data + pos);
-      for (zeropos = 0; zeropos < 1024 / sizeof(int); zeropos++) {
-       if(0 != idata[zeropos]) {
-         header_is_empty = 0;
-         break;
-       }
-      }
+    /*
+     * Compute TAR header checksum and compare with stored value.
+     * Allow for non-conformant checksums computed with signed values,
+     * such as those produced by early Solaris tar.
+     * Allow for non-conformant checksums computed on first 500 octets,
+     * such as those produced by SunOS 4.x tar according to J. Schilling.
+     * This will also detect EOF marks, since a zero-filled block
+     * cannot possibly hold octal values.
+     */
+    for (checksum_offset = 0; checksum_offset < 148; checksum_offset += 1) {
+      checksum_computed_500u += (unsigned char) data[pos + checksum_offset];
+      checksum_computed_500s += (signed char) data[pos + checksum_offset];
     }
+    if (8 > tar_octalvalue(data + pos + checksum_offset, 8, &checksum_stored))
+      break;
+    for (; checksum_offset < 156; checksum_offset += 1) {
+      checksum_computed_500u += (unsigned char) ' ';
+      checksum_computed_500s += (signed char) ' ';
+    }
+    for (; checksum_offset < 500; checksum_offset += 1) {
+      checksum_computed_500u += (unsigned char) data[pos + checksum_offset];
+      checksum_computed_500s += (signed char) data[pos + checksum_offset];
+    }
 
-    if (header_is_empty) /* assume the EOF mark was reached */
+    checksum_computed_512u = checksum_computed_500u;
+    checksum_computed_512s = checksum_computed_500s;
+    for (; checksum_offset < TAR_HEADER_SIZE; checksum_offset += 1) {
+      checksum_computed_512u += (unsigned char) data[pos + checksum_offset];
+      checksum_computed_512s += (signed char) data[pos + checksum_offset];
+    }
+
+    /*
+     * Suggestion: use signed checksum matches to refine
+     * TAR format detection.
+     */
+    if ( (checksum_stored != (unsigned long long) checksum_computed_512u)
+      && (checksum_stored != (unsigned long long) checksum_computed_512s)
+      && (checksum_stored != (unsigned long long) checksum_computed_500s)
+      && (checksum_stored != (unsigned long long) checksum_computed_500u) )
       break;
-
+ 
     tar = (const TarHeader*) &data[pos];
+    typeFlag = tar->link;
+    pos += TAR_HEADER_SIZE;
 
     /*
-     * checking all octal fields helps reduce
+     * Checking all octal fields helps reduce
      * the possibility of false positives ;
-     * only the file size is used for now.
+     * only the file size, time and mode are used for now.
+     *
+     * This will fail over GNU and Schilling TAR huge size fields
+     * using non-octal encodings used for very large file lengths (> 8 GB).
      */
-    if( (12 > taroctalvalue(tar->filesize, 12, &fsize))
-     || (12 > taroctalvalue(tar->lastModTime, 12, NULL))
-     || (8  > taroctalvalue(tar->mode, 8, NULL))
-     || (8  > taroctalvalue(tar->userId, 8, NULL))
-     || (8  > taroctalvalue(tar->groupId, 8, NULL)) )
+    if( (12 > tar_octalvalue(tar->fileSize, 12,
+                             &fsize))
+     || (12 > tar_octalvalue(tar->lastModTime, 12,
+                             (unsigned long long *) &ftime))
+     || (8  > tar_octalvalue(tar->mode, 8,
+                             (unsigned long long *) &fmode))
+     || (8  > tar_octalvalue(tar->userId, 8, NULL))
+     || (8  > tar_octalvalue(tar->groupId, 8, NULL)) )
       break;
 
-    /* fixme: we may want to check the header checksum here... */
-    /* fixme: we attempt to follow MKS document for long file names,
-       but no TAR file was found yet which matched what we understood ! */
-    if (pos + sizeof(USTarHeader) < size) {
+    /*
+     * Find out which TAR variant is here.
+     */
+    if(0 == memcmp(tar->ustarMagic, "ustar  ", 7)) {
 
-      nul_pos = memchr(data + pos, 0, sizeof tar->name);
-      tar_name_length = (0 == nul_pos)
-                     ? sizeof(tar->name)
-                      : (nul_pos - (data + pos));
+      if(' ' == tar->mode[6])
+        format_member = TAR_GNU1991_FORMAT;
+      else if( ('K' == typeFlag) || ('L' == typeFlag) ) {
+        format_member = TAR_GNU1997_FORMAT;
+        ftime = TAR_TIME_FENCE;
+      } else
+        format_member = ( ((unsigned) fmode) != (((unsigned) fmode) & 03777) )
+                      ? TAR_GNU1997_FORMAT : TAR_GNU2004_FORMAT;
 
-      ustar = (const USTarHeader*) &data[pos];
+    } else if (0 == memcmp(tar->ustarMagic, "ustar", 6)) {
 
-      if (NULL == mimetype) {
-        if(0 == memcmp(ustar->magic, "ustar  ", 7))
-          mimetype = "application/x-gtar";
-        else
-          mimetype = "application/x-tar";
+      /*
+       * It is important to perform test for SCHILLING1994 before GNU1997
+       * because certain extension type flags ('L' and 'S' for instance)
+       * are used by both.
+       */
+      if( (0 ==  tar->prefix[130])
+       && (12 <= tar_octalvalue(tar->prefix + 131, 12, NULL))
+       && (12 <= tar_octalvalue(tar->prefix + 143, 12, NULL))
+       && (0 == tar_isnonzero(tar->filler, 8))
+       && (0 == memcmp(tar->filler + 8, "tar", 4)) ) {
+
+        format_member = TAR_SCHILLING1994_FORMAT;
+
+      } else if ( ('D' == typeFlag) || ('K' == typeFlag)
+               || ('L' == typeFlag) || ('M' == typeFlag)
+               || ('N' == typeFlag) || ('S' == typeFlag)
+               || ('V' == typeFlag) ) {
+
+        format_member = TAR_GNU1997_FORMAT;
+
+      } else if ( ('g' == typeFlag)
+               || ('x' == typeFlag) || ('X' == typeFlag) ) {
+
+        format_member = TAR_POSIX2001_FORMAT;
+        ftime = TAR_TIME_FENCE;
+
+      } else {
+
+        format_member = TAR_POSIX1988_FORMAT;
+
       }
+    } else if ( (0 == memcmp(tar->filler + 8, "tar", 4))
+             && (0 == tar_isnonzero(tar->filler, 8)) ) {
 
-      if (0 == strncmp("ustar",
-                       &ustar->magic[0],
-                       strlen("ustar"))) {
-        if(0 != *ustar->prefix) {
-           nul_pos = memchr(ustar->prefix, 0, sizeof ustar->prefix);
+      format_member = TAR_SCHILLING1985_FORMAT;
 
-           ustar_prefix_length = (0 == nul_pos)
-                               ? sizeof ustar->prefix
-                               : nul_pos - ustar->prefix;
-           ustar_prefix = ustar->prefix;
+    } else if ( ('0' <= typeFlag) && (typeFlag <= '2') ) {
+
+      format_member = TAR_V7ORIGINAL_FORMAT;
+
+    } else {
+
+      format_member = TAR_V7EXTENDED_FORMAT;
+
+    }
+
+    /*
+     * Locate the file names.
+     */
+    if ( (0 != (format_member & TAR_POSIX2001_FORMAT)) 
+      && (('x' == typeFlag) || ('X' == typeFlag)) ) {
+
+      if(size <= pos)
+        break;
+
+      else if ( (8 <= fsize) && fsize <= (unsigned long long) (size - pos)) {
+        const char *keyptr  = data + pos;
+        const char *valptr  = NULL;
+        const char *nameptr = NULL;
+        unsigned int keylength = 0;
+        unsigned int namelength = 0;
+
+        while (keyptr < data + pos + (size_t) fsize) {
+          if( ('0' > *keyptr) || ('9' < *keyptr) ) {
+            keyptr += 1;
+            continue;
+          }
+
+          keylength = (unsigned int) strtoul(keyptr, (char **) &valptr, 10);
+          if( (0 < keylength) && (NULL != valptr) && (keyptr != valptr) ) {
+            unsigned int difflength = 0;
+
+            while( (valptr < data + pos + (size_t) fsize)
+                && (' ' == *valptr) )
+              valptr += 1;
+
+            difflength = (valptr - keyptr);
+
+            if (0 == memcmp(valptr, "path=", 5)) {
+              nameptr = valptr + 5;
+              namelength = keylength - (nameptr - keyptr);
+            } else {
+
+              if( (keylength > (valptr - keyptr) + 4 + 2)
+               && (0 == memcmp(valptr, "GNU.", 4)) )
+                format_archive |= TAR_GNU2004_FORMAT; 
+
+              else if( (keylength > (valptr - keyptr) + 7 + 2)
+               && (0 == memcmp(valptr, "SCHILY.", 7)) )
+                format_archive |= TAR_SCHILLING2001_FORMAT; 
+
+              else if( (keylength > (valptr - keyptr) + 4 + 2)
+               && (0 == memcmp(valptr, "SUN.", 4)) )
+                format_archive |= TAR_SOLARIS2001_FORMAT; 
+            }
+
+            keyptr += keylength;
+          } else {
+            nameptr = NULL;
+            break;
+          }
         }
+        
+        if ( (NULL != nameptr) && (0 != *nameptr)
+          && ((size - (nameptr - data)) >= namelength) && (1 < namelength) ) {
+          if (NULL != fname)
+            free(fname);
+          /*
+           * There is a 1-offset because POSIX.1-2001
+           * field separator is counted in field length.
+           */
+          fname = malloc(namelength);
+          if (NULL != fname) {
+            memcpy(fname, nameptr, namelength-1);
+            fname[namelength-1] = '\0';
+
+            pos += tar_roundup((size_t) fsize);
+            format_archive |= format_member;
+            continue;
+          }
+        }
       }
+    }
 
-      pos += 512; /* V7 Tar, USTar and GNU Tar usual headers take 512 octets */
+    else if ( (0 != (format_member
+                & (TAR_SCHILLING1994_FORMAT
+                  |TAR_GNU1997_FORMAT|TAR_GNU2004_FORMAT)))
+           && ('L' == typeFlag) ) {
+
+      if(size <= pos)
+        break;
+
+      else if ( (0 < fsize) && fsize <= (unsigned long long) (size - pos)) {
+
+        size_t length = (size_t) fsize;
+
+        nul_pos = memchr(data + pos, 0, length);
+        if (NULL != nul_pos)
+          length = (nul_pos - (data + pos));
+
+        if (0 < length) {
+          if (NULL != fname)
+            free(fname);
+          fname = malloc(1 + length);
+          if (NULL != fname) {
+            memcpy(fname, data + pos, length);
+            fname[length] = '\0';
+          }
+
+          pos += tar_roundup((size_t) fsize);
+          format_archive |= format_member;
+          continue;
+        }
+      }
     } else {
-      pos += 257; /* sizeof(TarHeader); minus gcc alignment... */
+
+      nul_pos = memchr(tar->fileName, 0, sizeof tar->fileName);
+      tar_name_length = (0 == nul_pos)
+                     ? sizeof(tar->fileName) : (nul_pos - tar->fileName);
+
+      if ( (0 != (format_member & (TAR_GNU1997_FORMAT|TAR_GNU2004_FORMAT)))
+       &&  ('S' == typeFlag) ) {
+
+        if( (0 == tar->prefix[40])
+        &&  (0 != tar->prefix[137])
+        &&  (12 <= tar_octalvalue(tar->prefix + 41, 12, NULL))
+        &&  (12 <= tar_octalvalue(tar->prefix + 53, 12, NULL)) ) {
+          /*
+           * fsize needs adjustment when there are more than 4 sparse blocks
+           */
+          size_t diffpos = 0;
+          fsize += TAR_HEADER_SIZE;
+ 
+          while ( (pos + diffpos + TAR_HEADER_SIZE < size)
+               && (0 != *(data + pos + diffpos + 504)) ) {
+            diffpos += TAR_HEADER_SIZE;
+            fsize   += TAR_HEADER_SIZE;
+          }
+        }
+
+        typeFlag = '0';
+
+      } else if(0 != (format_member & TAR_SCHILLING1994_FORMAT) ) {
+
+       nul_pos = memchr(tar->prefix, 0, 130);
+        tar_prefix_length = (0 == nul_pos)
+                          ? 130
+                          : (nul_pos - tar->prefix);
+
+        if ('S' == typeFlag)
+          typeFlag = '0';
+
+      } else if(0 != (format_member & TAR_SCHILLING1985_FORMAT) ) {
+
+       nul_pos = memchr(tar->prefix, 0, 155);
+        tar_prefix_length = (0 == nul_pos)
+                          ? 155
+                          : (nul_pos - tar->prefix);
+
+
+        if ('S' == typeFlag)
+          typeFlag = '0';
+
+      } else if (0 != (format_member & TAR_POSIX1988_FORMAT) ) {
+
+        nul_pos = memchr(tar->prefix, 0, sizeof tar->prefix);
+        tar_prefix_length = (0 == nul_pos)
+                          ? sizeof tar->prefix
+                          : nul_pos - tar->prefix;
+
+      }
     }
 
-    if ( (pos + fsize > size) ||
-        (fsize > size) ||
-        (pos + fsize < pos) )
+    /*
+     * Update position so that next loop iteration will find
+     * either a TAR header or TAR EOF mark or just EOF.
+     *
+     * Consider archive member size to be zero
+     * with no data following the header in the following cases :
+     * '1' : hard link, '2' : soft link,
+     * '3' : character device, '4' : block device,
+     * '5' : directory, '6' : named pipe.
+     */
+    if('1' != typeFlag && '2' != typeFlag
+    && '3' != typeFlag && '4' != typeFlag
+    && '5' != typeFlag && '6' != typeFlag) {
+      if ( (fsize > (unsigned long long) size)
+        || (fsize + (unsigned long long) pos > (unsigned long long) size) )
+        break;
+
+      pos += tar_roundup((size_t) fsize);
+    }
+    if(pos - 1 > size)
       break;
 
-    if (0 < ustar_prefix_length + tar_name_length) {
-      char * fname = malloc(1 + ustar_prefix_length + tar_name_length);
+    format_archive |= format_member;
 
-      if (NULL != fname) {
-         if(0 < ustar_prefix_length)
-           memcpy(fname, ustar_prefix, ustar_prefix_length);
-         if(0 < tar_name_length)
-           memcpy(fname + ustar_prefix_length, tar->name, tar_name_length);
-         fname[ustar_prefix_length + tar_name_length]= '\0';
-         last = appendKeyword(EXTRACTOR_FILENAME, fname, last);
-         contents_are_empty = 0;
-        if (prev == NULL)
-          prev = last;
+    /*
+     * Store the file name in libextractor list.
+     *
+     * For the time being, only file types listed in POSIX.1-1988 ('0'..'7')
+     * are retained, leaving out labels, access control lists, etc.
+     */
+    if ( (0 == typeFlag) || (('0' <= typeFlag) && (typeFlag <= '7')) ) {
+      if (NULL == fname) {
+        if (0 < tar_prefix_length + tar_name_length) {
+          fname = malloc(2 + tar_prefix_length + tar_name_length);
+
+          if (NULL != fname) {
+            if (0 < tar_prefix_length) {
+              memcpy(fname, tar->prefix, tar_prefix_length);
+
+              if ( ('/' != tar->prefix[tar_prefix_length - 1])
+                && (0 < tar_name_length)
+                && ('/' != tar->fileName[0]) ) {
+                fname[tar_prefix_length] = '/';
+                tar_prefix_length += 1;
+              }
+            }
+
+            if (0 < tar_name_length)
+              memcpy(fname + tar_prefix_length, tar->fileName, 
tar_name_length);
+
+            fname[tar_prefix_length + tar_name_length]= '\0';
+          }
+        }
       }
+
+      if ( (NULL != fname) && (0 != *fname) ) {
+#if 0
+        fprintf(stdout,
+            "(%u) flag = %c, size = %u, tname = (%s), fname = (%s)\n",
+           __LINE__, typeFlag, (unsigned int) fsize,
+            (NULL == tar->fileName) ? "" : tar->fileName,
+            (NULL == fname) ? "" : fname);
+#endif
+
+        last = appendKeyword(EXTRACTOR_FILENAME, fname, last);
+        fname = NULL;
+       if (prev == NULL)
+         prev = last;
+        if (ftime > maxftime)
+          maxftime = ftime;
+        contents_are_empty = 0;
+      }
     }
 
-    if ( (fsize & 511) != 0)
-      fsize = (fsize | 511)+1; /* round up! */
-    if (pos + fsize < pos)
-      break;
-    pos += fsize;
+    if(NULL != fname) {
+      free(fname);
+      fname = NULL;
+    }
   }
 
+  if(NULL != fname) {
+    free(fname);
+    fname = NULL;
+  }
+
   /*
-   * we only report mimetype when at least one archive member was found;
-   * this should avoid most magic number ambiguities (more checks needed).
+   * Report mimetype; report also format(s) and most recent date
+   * when at least one archive member was found.
    */
-  if ( (NULL != mimetype) && (0 == contents_are_empty) )
-    prev = addKeyword(EXTRACTOR_MIMETYPE, strdup(mimetype), prev);
+  if (0 != format_archive) {
+    if (0 == contents_are_empty) {
 
+      const char *formats[5] = {NULL, NULL, NULL, NULL, NULL};
+      unsigned int formats_count = 0;
+      unsigned int formats_u     = 0;
+      unsigned int format_length = 0;
+      char *format = NULL;
+
+      if(TAR_TIME_FENCE < maxftime) {
+        char iso8601_time[24];
+
+        if(0 == tar_time(maxftime, iso8601_time, sizeof iso8601_time))
+          prev = addKeyword(EXTRACTOR_DATE, strdup(iso8601_time), prev);
+      }
+
+      /*
+       * We only keep the most recent POSIX format.
+       */
+      if (0 != (format_archive & TAR_POSIX2001_FORMAT))
+        formats[formats_count++] = "POSIX 2001";
+
+      else if (0 != (format_archive & TAR_POSIX1988_FORMAT))
+        formats[formats_count++] = "POSIX 1988";
+
+      /*
+       * We only keep the most recent GNU format.
+       */
+      if (0 != (format_archive & TAR_GNU2004_FORMAT))
+        formats[formats_count++] = "GNU 2004";
+
+      else if (0 != (format_archive & TAR_GNU1997_FORMAT))
+        formats[formats_count++] = "GNU 1997";
+
+      else if (0 != (format_archive & TAR_GNU1991_FORMAT))
+        formats[formats_count++] = "GNU 1991";
+
+      /*
+       * We only keep the most recent Schilling format.
+       */
+      if (0 != (format_archive & TAR_SCHILLING2001_FORMAT))
+        formats[formats_count++] = "Schilling 2001";
+
+      else if (0 != (format_archive & TAR_SCHILLING1994_FORMAT))
+        formats[formats_count++] = "Schilling 1994";
+
+      else if (0 != (format_archive & TAR_SCHILLING1985_FORMAT))
+        formats[formats_count++] = "Schilling 1985";
+
+      /*
+       * We only keep the most recent Solaris format.
+       */
+      if (0 != (format_archive & TAR_SOLARIS2001_FORMAT))
+        formats[formats_count++] = "Solaris 2001";
+
+      /*
+       * We only keep the (supposedly) most recent UNIX V7 format.
+       */
+      if (0 != (format_archive & TAR_V7EXTENDED_FORMAT))
+        formats[formats_count++] = "UNIX extended V7";
+
+      else if (0 != (format_archive & TAR_V7ORIGINAL_FORMAT))
+        formats[formats_count++] = "UNIX original V7";
+
+      /*
+       * Build the format string
+       */
+      for(formats_u = 0; formats_u < formats_count; formats_u += 1) {
+        if( (NULL != formats[formats_u]) && (0 != *formats[formats_u]) ) {
+          if (0 < format_length)
+            format_length += 3;
+          format_length += strlen(formats[formats_u]);
+        }
+      }
+
+      if(0 < format_length)
+      {
+        format = malloc(format_length + 5);
+
+        if (NULL != format) {
+
+          format_length = 0;
+
+          for(formats_u = 0; formats_u < formats_count; formats_u += 1) {
+            if( (NULL != formats[formats_u]) && (0 != *formats[formats_u]) ) {
+              if (0 < format_length) {
+                strcpy(format + format_length, " + ");
+                format_length += 3;
+              }
+              strcpy(format + format_length, formats[formats_u]);
+              format_length += strlen(formats[formats_u]);
+            }
+          }
+
+          if(0 < format_length) {
+            strcpy(format + format_length, " TAR");
+            prev = addKeyword(EXTRACTOR_FORMAT, format, prev);
+          }
+        }
+      }
+    }
+
+    prev = addKeyword(EXTRACTOR_MIMETYPE, strdup("application/x-tar"), prev);
+  }
+
   return prev;
 }





reply via email to

[Prev in Thread] Current Thread [Next in Thread]