gnunet-svn
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[GNUnet-SVN] r23289 - in Extractor: . src/plugins src/plugins/old src/pl


From: gnunet
Subject: [GNUnet-SVN] r23289 - in Extractor: . src/plugins src/plugins/old src/plugins/testdata test
Date: Fri, 17 Aug 2012 18:25:36 +0200

Author: grothoff
Date: 2012-08-17 18:25:36 +0200 (Fri, 17 Aug 2012)
New Revision: 23289

Added:
   Extractor/src/plugins/html_extractor.c
   Extractor/src/plugins/man_extractor.c
   Extractor/src/plugins/riff_extractor.c
   Extractor/src/plugins/test_html.c
   Extractor/src/plugins/test_man.c
   Extractor/src/plugins/testdata/html_grothoff.html
   Extractor/src/plugins/testdata/man_extract.1
Removed:
   Extractor/src/plugins/old/html_extractor.c
   Extractor/src/plugins/old/man_extractor.c
   Extractor/src/plugins/old/riff_extractor.c
   Extractor/test/test.html
Modified:
   Extractor/TODO
   Extractor/configure.ac
   Extractor/src/plugins/
   Extractor/src/plugins/Makefile.am
   Extractor/src/plugins/thumbnailgtk_extractor.c
Log:
work on misc plugins

Modified: Extractor/TODO
===================================================================
--- Extractor/TODO      2012-08-17 14:54:29 UTC (rev 23288)
+++ Extractor/TODO      2012-08-17 16:25:36 UTC (rev 23289)
@@ -1,16 +1,17 @@
 * Update plugins to new API (and cleanup code):
-  - thumbnail-ffmpeg
-  - thumbnail-qt
   - tar
-  - html
-  - man
   - dvi
   - elf
   - applefile
-  - riff
   - ps
   - pdf
 
+* plugins without tests:
+  - gstreamer (testcase file exists, but does nothing)
+  - riff
+  - sid
+
+
 Desirable missing formats:
 * mbox / various e-mail formats
 * info pages (scan for 'Node: %s^?ID' - see end of .info files!)

Modified: Extractor/configure.ac
===================================================================
--- Extractor/configure.ac      2012-08-17 14:54:29 UTC (rev 23288)
+++ Extractor/configure.ac      2012-08-17 16:25:36 UTC (rev 23289)
@@ -342,6 +342,14 @@
    AM_CONDITIONAL(HAVE_MAGIC, false))],
   AM_CONDITIONAL(HAVE_MAGIC, false))
 
+AC_MSG_CHECKING(for tidyInitSource -ltidy)
+SAVED_LDFLAGS=$LDFLAGS
+AC_CHECK_LIB(tidy, tidyInitSource,
+  [AC_CHECK_HEADERS([tidy/tidy.h],
+   AM_CONDITIONAL(HAVE_TIDY, true),
+   AM_CONDITIONAL(HAVE_TIDY, false))],
+  AM_CONDITIONAL(HAVE_TIDY, false))
+
 # restore LIBS
 LIBS=$LIBSOLD
 
@@ -449,7 +457,7 @@
 then
   AC_DEFINE_UNQUOTED([HAVE_GTK], 1, [We have GTK])
 else
-  AC_MSG_ERROR(Cannot find GTK: Is pkg-config in path?)
+  AC_MSG_NOTICE([Cannot find GTK: Is pkg-config in path?])
 fi
 
 CFLAGS="$CFLAGS $GTK_CFLAGS"


Property changes on: Extractor/src/plugins
___________________________________________________________________
Modified: svn:ignore
   - test_tiff
test_thumbnailgtk
test_thumbnailffmpeg
test_zip
test_xm
test_png
test_odf
test_nsfe
test_nsf
test_deb
test_rpm
test_flac
test_gif
test_mime
test_ogg
thumbnailextractorqt.loT
Makefile.in
Makefile
.deps

   + test_wav
test_s3m
test_ole2
test_mpeg
test_man
test_jpeg
test_it
test_html
test_exiv2
test_tiff
test_thumbnailgtk
test_thumbnailffmpeg
test_zip
test_xm
test_png
test_odf
test_nsfe
test_nsf
test_deb
test_rpm
test_flac
test_gif
test_mime
test_ogg
thumbnailextractorqt.loT
Makefile.in
Makefile
.deps


Modified: Extractor/src/plugins/Makefile.am
===================================================================
--- Extractor/src/plugins/Makefile.am   2012-08-17 14:54:29 UTC (rev 23288)
+++ Extractor/src/plugins/Makefile.am   2012-08-17 16:25:36 UTC (rev 23289)
@@ -36,7 +36,9 @@
   testdata/nsf_arkanoid.nsf \
   testdata/nsfe_classics.nsfe \
   testdata/xm_diesel.xm \
-  testdata/tiff_haute.tiff
+  testdata/tiff_haute.tiff \
+  testdata/man_extract.1 \
+  testdata/html_grothoff.html
 
 if HAVE_VORBISFILE
 PLUGIN_OGG=libextractor_ogg.la
@@ -58,7 +60,12 @@
 PLUGIN_FFMPEG=libextractor_thumbnailffmpeg.la
 TEST_FFMPEG=test_thumbnailffmpeg
 endif
+
+if HAVE_TIDY
+PLUGIN_HTML=libextractor_html.la
+TEST_HTML=test_html
 endif
+endif
 
 if HAVE_GIF
 PLUGIN_GIF=libextractor_gif.la
@@ -105,6 +112,7 @@
 TEST_ZLIB=test_deb
 endif
 
+
 if HAVE_GSTREAMER
 PLUGIN_GSTREAMER=libextractor_gstreamer.la
 TEST_GSTREAMER=test_gstreamer
@@ -112,6 +120,7 @@
 
 plugin_LTLIBRARIES = \
   libextractor_it.la \
+  libextractor_man.la \
   libextractor_nsf.la \
   libextractor_nsfe.la \
   libextractor_odf.la \
@@ -119,9 +128,11 @@
   libextractor_xm.la \
   libextractor_s3m.la \
   libextractor_sid.la \
+  libextractor_riff.la \
   libextractor_wav.la \
   libextractor_zip.la \
   $(PLUGIN_GTK) \
+  $(PLUGIN_HTML) \
   $(PLUGIN_FFMPEG) \
   $(PLUGIN_ZLIB) \
   $(PLUGIN_OGG) \
@@ -142,6 +153,7 @@
 
 check_PROGRAMS = \
   test_wav \
+  test_man \
   test_it \
   test_s3m \
   test_png \
@@ -151,6 +163,7 @@
   test_nsf \
   test_nsfe \
   $(TEST_ZLIB) \
+  $(TEST_HTML) \
   $(TEST_GTK) \
   $(TEST_FFMPEG) \
   $(TEST_OGG) \
@@ -201,6 +214,17 @@
   $(top_builddir)/src/plugins/libtest.la
 
 
+libextractor_man_la_SOURCES = \
+  man_extractor.c
+libextractor_man_la_LDFLAGS = \
+  $(PLUGINFLAGS)
+
+test_man_SOURCES = \
+  test_man.c
+test_man_LDADD = \
+  $(top_builddir)/src/plugins/libtest.la
+
+
 libextractor_nsf_la_SOURCES = \
   nsf_extractor.c
 libextractor_nsf_la_LDFLAGS = \
@@ -279,6 +303,12 @@
   $(PLUGINFLAGS)
 
 
+libextractor_riff_la_SOURCES = \
+  riff_extractor.c
+libextractor_riff_la_LDFLAGS = \
+  $(PLUGINFLAGS)
+
+
 libextractor_s3m_la_SOURCES = \
   s3m_extractor.c
 libextractor_s3m_la_LDFLAGS = \
@@ -477,3 +507,16 @@
   test_thumbnailffmpeg.c
 test_thumbnailffmpeg_LDADD = \
   $(top_builddir)/src/plugins/libtest.la
+
+
+libextractor_html_la_SOURCES = \
+  html_extractor.c
+libextractor_html_la_LDFLAGS = \
+  $(PLUGINFLAGS)
+libextractor_html_la_LIBADD = \
+  -ltidy -lmagic
+
+test_html_SOURCES = \
+  test_html.c
+test_html_LDADD = \
+  $(top_builddir)/src/plugins/libtest.la

Copied: Extractor/src/plugins/html_extractor.c (from rev 23273, Extractor/src/plugins/old/html_extractor.c)
===================================================================
--- Extractor/src/plugins/html_extractor.c                              (rev 0)
+++ Extractor/src/plugins/html_extractor.c      2012-08-17 16:25:36 UTC (rev 23289)
@@ -0,0 +1,694 @@
+/*
+     This file is part of libextractor.
+     (C) 2002, 2003, 2004, 2005, 2009, 2012 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 2, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+
+ */
+/**
+ * @file plugins/html_extractor.c
+ * @brief plugin to support HTML files
+ * @author Christian Grothoff
+ */
+#include "platform.h"
+#include "extractor.h"
+#include <magic.h>
+#include <tidy/tidy.h>
+#include <tidy/buffio.h>
+
+/**
+ * Mapping of HTML META names to LE types.
+ *
+ * The table is scanned linearly by 'tag_to_type', which compares
+ * names with 'strcasecmp' (so the case of the entries does not
+ * matter) and stops at the first entry whose 'name' is NULL.  The
+ * terminating entry must therefore stay last.
+ */
+static struct
+{
+  /**
+   * HTML META name (value of the 'name' attribute); NULL marks the
+   * end of the table.
+   */
+  const char *name;
+
+  /**
+   * Corresponding LE type.
+   */
+  enum EXTRACTOR_MetaType type;
+} tagmap[] = {
+  { "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
+  { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
+  { "title", EXTRACTOR_METATYPE_TITLE },
+  { "dc.title", EXTRACTOR_METATYPE_TITLE},
+  { "description", EXTRACTOR_METATYPE_DESCRIPTION },
+  { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
+  { "subject", EXTRACTOR_METATYPE_SUBJECT},
+  { "dc.subject", EXTRACTOR_METATYPE_SUBJECT},
+  { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
+  { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE},
+  { "publisher", EXTRACTOR_METATYPE_PUBLISHER },
+  { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER},
+  { "rights", EXTRACTOR_METATYPE_RIGHTS },
+  { "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
+  { "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
+  { "language", EXTRACTOR_METATYPE_LANGUAGE },  
+  { "keywords", EXTRACTOR_METATYPE_KEYWORDS },
+  { "abstract", EXTRACTOR_METATYPE_ABSTRACT },
+  { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
+  { "dc.creator", EXTRACTOR_METATYPE_CREATOR},
+  { "dc.identifier", EXTRACTOR_METATYPE_URI },
+  { "dc.format", EXTRACTOR_METATYPE_FORMAT },
+  /* end-of-table marker */
+  { NULL, EXTRACTOR_METATYPE_RESERVED }
+};
+
+
+/**
+ * Global handle to MAGIC data.  Opened by the library constructor
+ * 'html_gobject_init', used by 'EXTRACTOR_html_extract_method' to
+ * determine the MIME type of the input, and closed (and reset to
+ * NULL) by the library destructor 'html_ltdl_fini'.
+ */
+static magic_t magic;
+
+
+/**
+ * Translate the name of an HTML 'meta' tag into the LE type that is
+ * registered for it in 'tagmap'.  Names are compared without regard
+ * to case.
+ *
+ * @param tag META name to look up
+ * @return the LE type for 'tag', EXTRACTOR_METATYPE_RESERVED if
+ *         'tag' is not listed in 'tagmap'
+ */
+static enum EXTRACTOR_MetaType
+tag_to_type (const char *tag)
+{
+  unsigned int idx = 0;
+
+  /* the table ends with an entry whose name is NULL */
+  while (NULL != tagmap[idx].name)
+    {
+      if (0 == strcasecmp (tag, tagmap[idx].name))
+        return tagmap[idx].type;
+      idx++;
+    }
+  return EXTRACTOR_METATYPE_RESERVED;
+}
+
+
+/**
+ * Report filter installed into libtidy.  All arguments are ignored
+ * and every message is rejected, so tidy produces no diagnostics.
+ *
+ * @param doc tidy doc being processed (unused)
+ * @param lvl report level (unused)
+ * @param line input line (unused)
+ * @param col input column (unused)
+ * @param mssg message (unused)
+ * @return always 0 (FALSE, no output)
+ */
+static Bool
+report_cb (TidyDoc doc,
+           TidyReportLevel lvl,
+           uint line,
+           uint col,
+           ctmbstr mssg)
+{
+  (void) doc;
+  (void) lvl;
+  (void) line;
+  (void) col;
+  (void) mssg;
+  return 0;
+}
+
+
+/**
+ * Input callback for libtidy: fetch the next byte of the file via
+ * the extraction context's 'read' function.
+ *
+ * @param sourceData our 'struct EXTRACTOR_ExtractContext'
+ * @return the byte read (as unsigned char), EndOfStream if 'read'
+ *         did not deliver exactly one byte (error or EOF)
+ */
+static int
+get_byte_cb (void *sourceData)
+{
+  struct EXTRACTOR_ExtractContext *ctx = sourceData;
+  void *chunk;
+
+  if (1 == ctx->read (ctx->cls, &chunk, 1))
+    return *(unsigned char *) chunk;
+  return EndOfStream;
+}
+
+
+/**
+ * Input callback for libtidy: push back the last byte read.  This is
+ * done by seeking one byte backwards from the current position; the
+ * result of the seek is deliberately discarded.
+ *
+ * @param sourceData our 'struct EXTRACTOR_ExtractContext'
+ * @param bt byte to unget (ignored, the data is re-read from the file)
+ */
+static void
+unget_byte_cb (void *sourceData, byte bt)
+{
+  struct EXTRACTOR_ExtractContext *ctx = sourceData;
+
+  (void) bt;
+  (void) ctx->seek (ctx->cls, -1, SEEK_CUR);
+}
+
+
+/**
+ * Input callback for libtidy: test whether the input is exhausted.
+ * The current offset is obtained with a zero-byte relative seek and
+ * compared against the total size reported by the context.
+ *
+ * @param sourceData our 'struct EXTRACTOR_ExtractContext'
+ * @return true (1) if the current position equals the file size,
+ *         false (0) otherwise
+ */
+static Bool
+eof_cb (void *sourceData)
+{
+  struct EXTRACTOR_ExtractContext *ctx = sourceData;
+
+  if (ctx->seek (ctx->cls, 0, SEEK_CUR) == ctx->get_size (ctx->cls))
+    return 1;
+  return 0;
+}
+
+
+/**
+ * Give the text below a TITLE element to the metadata processor.
+ *
+ * @param ec extraction context (supplies 'proc' and its closure)
+ * @param doc tidy document that 'text' belongs to
+ * @param text first child of the TITLE element
+ * @return 0 to continue extracting, non-zero if 'proc' wants to stop
+ */
+static int
+process_title (struct EXTRACTOR_ExtractContext *ec,
+               TidyDoc doc,
+               TidyNode text)
+{
+  TidyBuffer tbuf;
+  int ret;
+
+  ret = 0;
+  tidyBufInit (&tbuf);
+  /* only report a title if tidy could actually produce the text */
+  if (tidyNodeGetValue (doc, text, &tbuf))
+    {
+      /* add 0-termination; 'tbuf.size' then includes the terminator */
+      tidyBufPutByte (&tbuf, 0);
+      ret = ec->proc (ec->cls,
+                      "html",
+                      EXTRACTOR_METATYPE_TITLE,
+                      EXTRACTOR_METAFORMAT_UTF8,
+                      "text/plain",
+                      (const char *) tbuf.bp,
+                      tbuf.size);
+    }
+  tidyBufFree (&tbuf);
+  return ret;
+}
+
+
+/**
+ * Inspect a META element and, if its 'name' maps to a known LE type
+ * and it has a 'content' value, give that value to the metadata
+ * processor.  META elements with missing or valueless attributes are
+ * skipped.
+ *
+ * @param ec extraction context (supplies 'proc' and its closure)
+ * @param meta the META element
+ * @return 0 to continue extracting, non-zero if 'proc' wants to stop
+ */
+static int
+process_meta (struct EXTRACTOR_ExtractContext *ec,
+              TidyNode meta)
+{
+  TidyAttr attr;
+  const char *value;
+  enum EXTRACTOR_MetaType type;
+
+  if (NULL == (attr = tidyAttrGetById (meta, TidyAttr_NAME)))
+    return 0;
+  /* an attribute may be present without a value (<meta name>) */
+  if (NULL == (value = tidyAttrValue (attr)))
+    return 0;
+  if (EXTRACTOR_METATYPE_RESERVED == (type = tag_to_type (value)))
+    return 0;
+  if (NULL == (attr = tidyAttrGetById (meta, TidyAttr_CONTENT)))
+    return 0;
+  if (NULL == (value = tidyAttrValue (attr)))
+    return 0;
+  return ec->proc (ec->cls,
+                   "html",
+                   type,
+                   EXTRACTOR_METAFORMAT_UTF8,
+                   "text/plain",
+                   value,
+                   strlen (value) + 1);
+}
+
+
+/**
+ * Main entry method for the 'text/html' extraction plugin.
+ *
+ * Reads up to 16k from the start of the file and asks libmagic for
+ * its MIME type; anything that does not start with "text/html" is
+ * ignored.  The file is then parsed with libtidy and the children of
+ * the HEAD element are scanned: the text of TITLE and the 'content'
+ * of META elements listed in 'tagmap' are passed to 'ec->proc'.
+ *
+ * @param ec extraction context provided to the plugin
+ */
+void
+EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
+{
+  TidyDoc doc;
+  TidyNode head;
+  TidyNode child;
+  TidyNode title;
+  TidyInputSource src;
+  const char *name;
+  ssize_t iret;
+  void *data;
+  const char *mime;
+
+  if (NULL == magic)
+    return; /* libmagic handle could not be created */
+  if (0 >= (iret = ec->read (ec->cls,
+                             &data,
+                             16 * 1024)))
+    return;
+  if (NULL == (mime = magic_buffer (magic, data, iret)))
+    return;
+  if (0 != strncmp (mime,
+                    "text/html",
+                    strlen ("text/html")))
+    return; /* not HTML */
+
+  /* tidy must see the file from the beginning */
+  if (0 != ec->seek (ec->cls, 0, SEEK_SET))
+    return; /* seek failed !? */
+
+  tidyInitSource (&src, ec,
+                  &get_byte_cb,
+                  &unget_byte_cb,
+                  &eof_cb);
+  if (NULL == (doc = tidyCreate ()))
+    return;
+  tidySetReportFilter (doc, &report_cb);
+  tidySetAppData (doc, ec);
+  if (0 > tidyParseSource (doc, &src))
+    {
+      tidyRelease (doc);
+      return;
+    }
+  /* NOTE(review): only status 1 is accepted here.  libtidy appears to
+     use 0 = no issues, 1 = warnings, 2 = errors, in which case
+     documents without any warning are skipped -- confirm that this
+     is intended. */
+  if (1 != tidyStatus (doc))
+    {
+      tidyRelease (doc);
+      return;
+    }
+  if (NULL == (head = tidyGetHead (doc)))
+    {
+      tidyRelease (doc);
+      return;
+    }
+  for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))
+    {
+      /* only element nodes can be TITLE or META */
+      switch (tidyNodeGetType (child))
+        {
+        case TidyNode_Start:
+        case TidyNode_StartEnd:
+          break;
+        default:
+          continue;
+        }
+      if (NULL == (name = tidyNodeGetName (child)))
+        continue;
+      if ( (0 == strcasecmp (name, "title")) &&
+           (NULL != (title = tidyGetChild (child))) )
+        {
+          if (0 != process_title (ec, doc, title))
+            break;
+          continue;
+        }
+      if ( (0 == strcasecmp (name, "meta")) &&
+           (0 != process_meta (ec, child)) )
+        break;
+    }
+  tidyRelease (doc);
+}
+
+
+
+#if OLD
+
+
+/* ******************** parser helper functions ************** */
+
+static int
+tagMatch (const char *tag, const char *s, const char *e)
+{
+  return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s)));
+}
+
+static int
+lookFor (char c, size_t * pos, const char *data, size_t size)
+{
+  size_t p = *pos;
+
+  while ((p < size) && (data[p] != c))
+    {
+      if (data[p] == '\0')
+        return 0;
+      p++;
+    }
+  *pos = p;
+  return p < size;
+}
+
+static int
+skipWhitespace (size_t * pos, const char *data, size_t size)
+{
+  size_t p = *pos;
+
+  while ((p < size) && (isspace ( (unsigned char) data[p])))
+    {
+      if (data[p] == '\0')
+        return 0;
+      p++;
+    }
+  *pos = p;
+  return p < size;
+}
+
+static int
+skipLetters (size_t * pos, const char *data, size_t size)
+{
+  size_t p = *pos;
+
+  while ((p < size) && (isalpha ( (unsigned char) data[p])))
+    {
+      if (data[p] == '\0')
+        return 0;
+      p++;
+    }
+  *pos = p;
+  return p < size;
+}
+
+static int
+lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)
+{
+  size_t p = *pos;
+
+  while ((p < size) && (strchr (c, data[p]) == NULL))
+    {
+      if (data[p] == '\0')
+        return 0;
+      p++;
+    }
+  *pos = p;
+  return p < size;
+}
+
+static void
+findEntry (const char *key,
+           const char *start,
+           const char *end, const char **mstart, const char **mend)
+{
+  size_t len;
+
+  *mstart = NULL;
+  *mend = NULL;
+  len = strlen (key);
+  while (start < end - len - 1)
+    {
+      start++;
+      if (start[len] != '=')
+        continue;
+      if (0 == strncasecmp (start, key, len))
+        {
+          start += len + 1;
+          *mstart = start;
+          if ((*start == '\"') || (*start == '\''))
+            {
+              start++;
+              while ((start < end) && (*start != **mstart))
+                start++;
+              (*mstart)++;      /* skip quote */
+            }
+          else
+            {
+              while ((start < end) && (!isspace ( (unsigned char) *start)))
+                start++;
+            }
+          *mend = start;
+          return;
+        }
+    }
+}
+
+/**
+ * Search all tags that correspond to "tagname".  Example:
+ * If the tag is <meta name="foo" desc="bar">, and
+ * tagname == "meta", keyname="name", keyvalue="foo",
+ * and searchname="desc", then this function returns a
+ * copy (!) of "bar".  Easy enough?
+ *
+ * @return NULL if nothing is found
+ */
+static char *
+findInTags (struct TagInfo * t,
+            const char *tagname,
+            const char *keyname, const char *keyvalue, const char *searchname)
+{
+  const char *pstart;
+  const char *pend;
+
+  while (t != NULL)
+    {
+      if (tagMatch (tagname, t->tagStart, t->tagEnd))
+        {
+          findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
+          if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
+            {
+              findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
+              if (pstart != NULL)
+                {
+                  char *ret = malloc (pend - pstart + 1);
+                 if (ret == NULL)
+                   return NULL;
+                  memcpy (ret, pstart, pend - pstart);
+                  ret[pend - pstart] = '\0';
+                  return ret;
+                }
+            }
+        }
+      t = t->next;
+    }
+  return NULL;
+}
+
+
+/* mimetype = text/html */
+int 
+EXTRACTOR_html_extract (const char *data,
+                       size_t size,
+                       EXTRACTOR_MetaDataProcessor proc,
+                       void *proc_cls,
+                       const char *options)
+{
+  size_t xsize;
+  struct TagInfo *tags;
+  struct TagInfo *t;
+  struct TagInfo tag;
+  size_t pos;
+  size_t tpos;
+  int i;
+  char *charset;
+  char *tmp;
+  char *xtmp;
+  int ret;
+
+  ret = 0;
+  if (size == 0)
+    return 0;
+  /* only scan first 32k */
+  if (size > 1024 * 32)
+    xsize = 1024 * 32;
+  else
+    xsize = size;
+  tags = NULL;
+  tag.next = NULL;
+  pos = 0;
+  while (pos < xsize)
+    {
+      if (!lookFor ('<', &pos, data, size))
+        break;
+      tag.tagStart = &data[++pos];
+      if (!skipLetters (&pos, data, size))
+        break;
+      tag.tagEnd = &data[pos];
+      if (!skipWhitespace (&pos, data, size))
+        break;
+    STEP3:
+      if (!lookForMultiple (">\"\'", &pos, data, size))
+        break;
+      if (data[pos] != '>')
+        {
+          /* find end-quote, ignore escaped quotes (\') */
+          do
+            {
+              tpos = pos;
+              pos++;
+              if (!lookFor (data[tpos], &pos, data, size))
+                break;
+            }
+          while (data[pos - 1] == '\\');
+          pos++;
+          goto STEP3;
+        }
+      pos++;
+      if (!skipWhitespace (&pos, data, size))
+        break;
+      tag.dataStart = &data[pos];
+      if (!lookFor ('<', &pos, data, size))
+        break;
+      tag.dataEnd = &data[pos];
+      i = 0;
+      while (relevantTags[i] != NULL)
+        {
+          if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
+              (0 == strncasecmp (relevantTags[i],
+                                 tag.tagStart, tag.tagEnd - tag.tagStart)))
+            {
+              t = malloc (sizeof (struct TagInfo));
+             if (t == NULL)
+               return 0;
+              *t = tag;
+              t->next = tags;
+              tags = t;
+              break;
+            }
+          i++;
+        }
+      /* abort early if we hit the body tag */
+      if (tagMatch ("body", tag.tagStart, tag.tagEnd))
+        break;
+    }
+
+  /* fast exit */
+  if (tags == NULL)
+    return 0;
+
+  charset = NULL;
+  /* first, try to determine mime type and/or character set */
+  tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
+  if (tmp != NULL)
+    {
+      /* ideally, tmp == "text/html; charset=ISO-XXXX-Y" or something
+         like that;
+         if text/html is present, we take that as the mime-type; if charset=
+         is present, we try to use that for character set conversion. */
+      if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
+        ret = proc (proc_cls, 
+                   "html",
+                   EXTRACTOR_METATYPE_MIMETYPE,
+                   EXTRACTOR_METAFORMAT_UTF8,
+                   "text/plain",
+                   "text/html",
+                   strlen ("text/html")+1);
+      charset = strcasestr (tmp, "charset=");
+      if (charset != NULL)
+        charset = strdup (&charset[strlen ("charset=")]);
+      free (tmp);
+    }
+  i = 0;
+  while (tagmap[i].name != NULL)
+    {
+      tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
+      if ( (tmp != NULL) &&
+          (ret == 0) )
+        {
+         if (charset == NULL)
+           {
+             ret = proc (proc_cls,
+                         "html",
+                         tagmap[i].type,
+                         EXTRACTOR_METAFORMAT_C_STRING,
+                         "text/plain",
+                         tmp,
+                         strlen (tmp) + 1);
+           }
+         else
+           {
+             xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
+                                                      strlen (tmp),
+                                                      charset);
+             if (xtmp != NULL)
+               {
+                 ret = proc (proc_cls,
+                             "html",
+                             tagmap[i].type,
+                             EXTRACTOR_METAFORMAT_UTF8,
+                             "text/plain",
+                             xtmp,
+                             strlen (xtmp) + 1);
+                 free (xtmp);
+               }
+           }
+        }
+      if (tmp != NULL)
+       free (tmp);
+      i++;
+    }
+  while (tags != NULL) 
+    {
+      t = tags;
+      if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
+          (ret == 0) )
+       {
+         if (charset == NULL)
+           {
+             xtmp = malloc (t->dataEnd - t->dataStart + 1);
+             if (xtmp != NULL)
+               {
+                 memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
+                 xtmp[t->dataEnd - t->dataStart] = '\0';
+                 ret = proc (proc_cls,
+                             "html",
+                             EXTRACTOR_METATYPE_TITLE,
+                             EXTRACTOR_METAFORMAT_C_STRING,
+                             "text/plain",
+                             xtmp,
+                             strlen (xtmp) + 1);
+                 free (xtmp);
+               }
+           }
+         else
+           {
+             xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
+                                                      t->dataEnd - t->dataStart,
+                                                      charset);
+             if (xtmp != NULL)
+               {
+                 ret = proc (proc_cls,
+                             "html",
+                             EXTRACTOR_METATYPE_TITLE,
+                             EXTRACTOR_METAFORMAT_UTF8,
+                             "text/plain",
+                             xtmp,
+                             strlen (xtmp) + 1);
+                 free (xtmp);
+               }
+           }
+       }
+      tags = t->next;
+      free (t);
+    }
+  if (charset != NULL)
+    free (charset);
+  return ret;
+}
+#endif
+
+
+/**
+ * Constructor for the library: open the libmagic handle (configured
+ * to report MIME types) and load the default magic database.  If the
+ * handle cannot be created, 'magic' stays NULL and nothing is loaded.
+ */
+void __attribute__ ((constructor)) 
+html_gobject_init ()
+{
+  magic = magic_open (MAGIC_MIME_TYPE);
+  if (NULL == magic)
+    return; /* nothing to load the database into */
+  if (0 != magic_load (magic, NULL))
+    {
+      /* FIXME: how to deal with errors? */
+      /* NOTE(review): the handle is kept although no database was
+         loaded; presumably MIME detection then never yields
+         "text/html" -- confirm. */
+    }
+}
+
+
+/**
+ * Destructor for the library: closes the libmagic handle, if one is
+ * open, and resets the global to NULL.
+ */
+void __attribute__ ((destructor)) 
+html_ltdl_fini () 
+{
+  if (NULL == magic)
+    return;
+  magic_close (magic);
+  magic = NULL;
+}
+
+/* end of html_extractor.c */

Copied: Extractor/src/plugins/man_extractor.c (from rev 23273, Extractor/src/plugins/old/man_extractor.c)
===================================================================
--- Extractor/src/plugins/man_extractor.c                               (rev 0)
+++ Extractor/src/plugins/man_extractor.c       2012-08-17 16:25:36 UTC (rev 23289)
@@ -0,0 +1,292 @@
+/*
+     This file is part of libextractor.
+     (C) 2002, 2003, 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 3, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+ */
+/**
+ * @file plugins/man_extractor.c
+ * @brief plugin to support man pages
+ * @author Christian Grothoff
+ */
+#include "platform.h"
+#include "extractor.h"
+#include <ctype.h>
+
+
+/**
+ * Duplicate the first 'n' bytes of 'str' into a freshly allocated,
+ * 0-terminated buffer.  See 'strndup'; unlike 'strndup' this always
+ * copies exactly 'n' bytes, so 'str' must provide at least that many.
+ *
+ * @param str input string
+ * @param n number of bytes to copy (the terminator is added on top)
+ * @return copy of the first 'n' bytes of 'str' plus 0-terminator
+ *         (to be released with 'free'), NULL if allocation failed
+ */
+static char *
+stndup (const char *str, size_t n)
+{
+  char *copy = malloc (n + 1);
+
+  if (NULL == copy)
+    return NULL;
+  memcpy (copy, str, n);
+  copy[n] = '\0';
+  return copy;
+}
+
+
+/**
+ * Hand one metadata item to LE.  A value that is enclosed in
+ * double-quotes is passed without the quotes; NULL pointers and
+ * values that are empty (after unquoting) are silently dropped.
+ * Ownership of 'keyword' passes to this function.
+ *
+ * @param type metadata type to use
+ * @param keyword metadata value (heap-allocated, may be NULL);
+ *        modified and freed in the process
+ * @param proc function to call with meta data
+ * @param proc_cls closure for 'proc'
+ * @return 0 to continue extracting, 1 if we are done
+ */
+static int
+add_keyword (enum EXTRACTOR_MetaType type,
+             char *keyword,
+             EXTRACTOR_MetaDataProcessor proc,
+             void *proc_cls)
+{
+  size_t len;
+  char *value;
+  int ret;
+
+  if (NULL == keyword)
+    return 0;
+  len = strlen (keyword);
+  value = keyword;
+  /* the first test fails for an empty string, so 'len - 1' is only
+     evaluated when 'len' is at least 1 */
+  if ( ('\"' == keyword[0]) &&
+       ('\"' == keyword[len - 1]) )
+    {
+      keyword[len - 1] = '\0';
+      value = keyword + 1;
+    }
+  ret = 0;
+  if ('\0' != value[0])
+    ret = proc (proc_cls,
+                "man",
+                type,
+                EXTRACTOR_METAFORMAT_UTF8,
+                "text/plain",
+                value,
+                strlen (value) + 1);
+  free (keyword);
+  return ret;
+}
+
+
+/**
+ * Find the end of the current token (which may be quoted).  A token
+ * ends at the first space that is not inside a pair of double-quotes,
+ * or at the end of the buffer.
+ *
+ * @param end in: offset of the beginning of the current token;
+ *        out: offset just past the token, or 'size + 1' if the token
+ *        contains an unterminated double-quote
+ * @param buf input buffer with the characters
+ * @param size number of bytes in buf
+ */
+static void
+find_end_of_token (size_t *end,
+                   const char *buf,
+                   const size_t size)
+{
+  size_t cur = *end;
+  int in_quote = 0;
+
+  for (; cur < size; cur++)
+    {
+      const char c = buf[cur];
+
+      if ( (! in_quote) && (' ' == c) )
+        break;
+      if ('\"' == c)
+        in_quote = ! in_quote;
+    }
+  *end = in_quote ? (size + 1) : cur;
+}
+
+
+/**
+ * How many bytes do we actually try to scan? (from the beginning
+ * of the file).
+ */
+#define MAX_READ (16 * 1024)
+
+
+/**
+ * Add a keyword to LE.  Expands to a call to 'add_keyword' using the
+ * 'proc' and 'cls' members of a variable named 'ec' that must be in
+ * scope; if 'add_keyword' returns non-zero, the macro executes
+ * 'return;' in the enclosing function, so it can only be used inside
+ * functions returning void.
+ * 
+ * @param t type to use
+ * @param s keyword to give to LE
+ */
+#define ADD(t,s) \
+  do { if (0 != add_keyword (t, s, ec->proc, ec->cls)) return; } while (0)
+
+
+/**
+ * Main entry method for the man page extraction plugin.
+ *
+ * Reads up to MAX_READ bytes from the start of the file, locates the
+ * ".TH " request and interprets the space-separated (optionally
+ * double-quoted) tokens on that line as, in order: title, section,
+ * modification date, source and book title.  Each non-empty token is
+ * passed to 'ec->proc'; extraction stops as soon as the processor
+ * returns non-zero or a token has an unterminated quote.
+ *
+ * @param ec extraction context provided to the plugin
+ */
+void
+EXTRACTOR_man_extract_method (struct EXTRACTOR_ExtractContext *ec)
+{
+  const size_t xlen = strlen (".TH ");
+  size_t pos;
+  size_t size;
+  size_t end;
+  void *data;
+  ssize_t rsize;
+  const char *buf;
+  const char *section;
+
+  if (0 >= (rsize = ec->read (ec->cls, &data, MAX_READ)))
+    return;
+  /* 'rsize' is positive here, so all further arithmetic is unsigned */
+  size = (size_t) rsize;
+  if (size < xlen)
+    return;
+  buf = data;
+  pos = 0;
+  /* find actual beginning of the man page (.TH at the start of the
+     buffer or directly after a newline); abort if we find
+     non-printable characters */
+  while ( (pos < size - xlen) &&
+          ( (0 != strncmp (".TH ",
+                           &buf[pos],
+                           xlen)) ||
+            ( (0 != pos) &&
+              ('\n' != buf[pos - 1]) ) ) )
+    {
+      if ( (! isgraph ((unsigned char) buf[pos])) &&
+           (! isspace ((unsigned char) buf[pos])) )
+        return;
+      pos++;
+    }
+  if (0 != strncmp (".TH ", &buf[pos], xlen))
+    return;
+
+  /* find end of ".TH"-line and limit processing to that line */
+  end = pos;
+  while ( (end < size) && ('\n' != buf[end]) )
+    end++;
+  size = end;
+
+  /* skip over ".TH" */
+  pos += xlen;
+
+  /* first token is the title */
+  end = pos;
+  find_end_of_token (&end, buf, size);
+  if (end > size)
+    return;
+  if (end > pos)
+    {
+      ADD (EXTRACTOR_METATYPE_TITLE, stndup (&buf[pos], end - pos));
+      pos = end + 1;
+    }
+  if (pos >= size)
+    return;
+
+  /* next token is the section */
+  end = pos;
+  find_end_of_token (&end, buf, size);
+  if (end > size)
+    return;
+  if ('\"' == buf[pos])
+    pos++;
+  if ( (end - pos >= 1) && (end - pos <= 4) )
+    {
+      /* only the first character of the section token is examined */
+      switch (buf[pos])
+        {
+        case '1':
+          section = _("Commands");
+          break;
+        case '2':
+          section = _("System calls");
+          break;
+        case '3':
+          section = _("Library calls");
+          break;
+        case '4':
+          section = _("Special files");
+          break;
+        case '5':
+          section = _("File formats and conventions");
+          break;
+        case '6':
+          section = _("Games");
+          break;
+        case '7':
+          section = _("Conventions and miscellaneous");
+          break;
+        case '8':
+          section = _("System management commands");
+          break;
+        case '9':
+          section = _("Kernel routines");
+          break;
+        default:
+          section = NULL;
+          break;
+        }
+      /* unknown sections are reported by their first character */
+      ADD (EXTRACTOR_METATYPE_SECTION,
+           (NULL != section)
+           ? strdup (section)
+           : stndup (&buf[pos], 1));
+      pos = end + 1;
+    }
+
+  /* next token is the modification date */
+  end = pos;
+  find_end_of_token (&end, buf, size);
+  if (end > size)
+    return;
+  if (end > pos)
+    {
+      ADD (EXTRACTOR_METATYPE_MODIFICATION_DATE,
+           stndup (&buf[pos], end - pos));
+      pos = end + 1;
+    }
+
+  /* next token is the source of the man page */
+  end = pos;
+  find_end_of_token (&end, buf, size);
+  if (end > size)
+    return;
+  if (end > pos)
+    {
+      ADD (EXTRACTOR_METATYPE_SOURCE,
+           stndup (&buf[pos], end - pos));
+      pos = end + 1;
+    }
+
+  /* last token is the title of the book the man page belongs to */
+  end = pos;
+  find_end_of_token (&end, buf, size);
+  if (end > size)
+    return;
+  if (end > pos)
+    ADD (EXTRACTOR_METATYPE_BOOK_TITLE,
+         stndup (&buf[pos], end - pos));
+}
+
+/* end of man_extractor.c */

Deleted: Extractor/src/plugins/old/html_extractor.c
===================================================================
--- Extractor/src/plugins/old/html_extractor.c  2012-08-17 14:54:29 UTC (rev 23288)
+++ Extractor/src/plugins/old/html_extractor.c  2012-08-17 16:25:36 UTC (rev 23289)
@@ -1,420 +0,0 @@
-/*
-     This file is part of libextractor.
-     (C) 2002, 2003, 2004, 2005, 2009 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
-
- */
-
-#include "platform.h"
-#include "extractor.h"
-#include <string.h>
-#include "convert.h"
-
-static struct
-{
-  const char *name;
-  enum EXTRACTOR_MetaType type;
-} tagmap[] = {
-  { "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
-  { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
-  { "title", EXTRACTOR_METATYPE_TITLE },
-  { "dc.title", EXTRACTOR_METATYPE_TITLE},
-  { "description", EXTRACTOR_METATYPE_DESCRIPTION },
-  { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
-  { "subject", EXTRACTOR_METATYPE_SUBJECT},
-  { "dc.subject", EXTRACTOR_METATYPE_SUBJECT},
-  { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
-  { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE},
-  { "publisher", EXTRACTOR_METATYPE_PUBLISHER },
-  { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER},
-  { "rights", EXTRACTOR_METATYPE_RIGHTS },
-  { "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
-  { "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
-  { "language", EXTRACTOR_METATYPE_LANGUAGE },  
-  { "keywords", EXTRACTOR_METATYPE_KEYWORDS },
-  { "abstract", EXTRACTOR_METATYPE_ABSTRACT },
-  { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
-  { "dc.creator", EXTRACTOR_METATYPE_CREATOR},
-  { "dc.identifier", EXTRACTOR_METATYPE_URI },
-  { "dc.format", EXTRACTOR_METATYPE_FORMAT },
-  { NULL, EXTRACTOR_METATYPE_RESERVED }
-};
-
-static const char *relevantTags[] = {
-  "title",
-  "meta",
-  NULL,
-};
-
-typedef struct TI
-{
-  struct TI *next;
-  const char *tagStart;
-  const char *tagEnd;
-  const char *dataStart;
-  const char *dataEnd;
-} TagInfo;
-
-
-
-
-/* ******************** parser helper functions ************** */
-
-static int
-tagMatch (const char *tag, const char *s, const char *e)
-{
-  return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s)));
-}
-
-static int
-lookFor (char c, size_t * pos, const char *data, size_t size)
-{
-  size_t p = *pos;
-
-  while ((p < size) && (data[p] != c))
-    {
-      if (data[p] == '\0')
-        return 0;
-      p++;
-    }
-  *pos = p;
-  return p < size;
-}
-
-static int
-skipWhitespace (size_t * pos, const char *data, size_t size)
-{
-  size_t p = *pos;
-
-  while ((p < size) && (isspace ( (unsigned char) data[p])))
-    {
-      if (data[p] == '\0')
-        return 0;
-      p++;
-    }
-  *pos = p;
-  return p < size;
-}
-
-static int
-skipLetters (size_t * pos, const char *data, size_t size)
-{
-  size_t p = *pos;
-
-  while ((p < size) && (isalpha ( (unsigned char) data[p])))
-    {
-      if (data[p] == '\0')
-        return 0;
-      p++;
-    }
-  *pos = p;
-  return p < size;
-}
-
-static int
-lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)
-{
-  size_t p = *pos;
-
-  while ((p < size) && (strchr (c, data[p]) == NULL))
-    {
-      if (data[p] == '\0')
-        return 0;
-      p++;
-    }
-  *pos = p;
-  return p < size;
-}
-
-static void
-findEntry (const char *key,
-           const char *start,
-           const char *end, const char **mstart, const char **mend)
-{
-  size_t len;
-
-  *mstart = NULL;
-  *mend = NULL;
-  len = strlen (key);
-  while (start < end - len - 1)
-    {
-      start++;
-      if (start[len] != '=')
-        continue;
-      if (0 == strncasecmp (start, key, len))
-        {
-          start += len + 1;
-          *mstart = start;
-          if ((*start == '\"') || (*start == '\''))
-            {
-              start++;
-              while ((start < end) && (*start != **mstart))
-                start++;
-              (*mstart)++;      /* skip quote */
-            }
-          else
-            {
-              while ((start < end) && (!isspace ( (unsigned char) *start)))
-                start++;
-            }
-          *mend = start;
-          return;
-        }
-    }
-}
-
-/**
- * Search all tags that correspond to "tagname".  Example:
- * If the tag is <meta name="foo" desc="bar">, and
- * tagname == "meta", keyname="name", keyvalue="foo",
- * and searchname="desc", then this function returns a
- * copy (!) of "bar".  Easy enough?
- *
- * @return NULL if nothing is found
- */
-static char *
-findInTags (TagInfo * t,
-            const char *tagname,
-            const char *keyname, const char *keyvalue, const char *searchname)
-{
-  const char *pstart;
-  const char *pend;
-
-  while (t != NULL)
-    {
-      if (tagMatch (tagname, t->tagStart, t->tagEnd))
-        {
-          findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
-          if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
-            {
-              findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
-              if (pstart != NULL)
-                {
-                  char *ret = malloc (pend - pstart + 1);
-                 if (ret == NULL)
-                   return NULL;
-                  memcpy (ret, pstart, pend - pstart);
-                  ret[pend - pstart] = '\0';
-                  return ret;
-                }
-            }
-        }
-      t = t->next;
-    }
-  return NULL;
-}
-
-
-/* mimetype = text/html */
-int 
-EXTRACTOR_html_extract (const char *data,
-                       size_t size,
-                       EXTRACTOR_MetaDataProcessor proc,
-                       void *proc_cls,
-                       const char *options)
-{
-  size_t xsize;
-  TagInfo *tags;
-  TagInfo *t;
-  TagInfo tag;
-  size_t pos;
-  size_t tpos;
-  int i;
-  char *charset;
-  char *tmp;
-  char *xtmp;
-  int ret;
-
-  ret = 0;
-  if (size == 0)
-    return 0;
-  /* only scan first 32k */
-  if (size > 1024 * 32)
-    xsize = 1024 * 32;
-  else
-    xsize = size;
-  tags = NULL;
-  tag.next = NULL;
-  pos = 0;
-  while (pos < xsize)
-    {
-      if (!lookFor ('<', &pos, data, size))
-        break;
-      tag.tagStart = &data[++pos];
-      if (!skipLetters (&pos, data, size))
-        break;
-      tag.tagEnd = &data[pos];
-      if (!skipWhitespace (&pos, data, size))
-        break;
-    STEP3:
-      if (!lookForMultiple (">\"\'", &pos, data, size))
-        break;
-      if (data[pos] != '>')
-        {
-          /* find end-quote, ignore escaped quotes (\') */
-          do
-            {
-              tpos = pos;
-              pos++;
-              if (!lookFor (data[tpos], &pos, data, size))
-                break;
-            }
-          while (data[pos - 1] == '\\');
-          pos++;
-          goto STEP3;
-        }
-      pos++;
-      if (!skipWhitespace (&pos, data, size))
-        break;
-      tag.dataStart = &data[pos];
-      if (!lookFor ('<', &pos, data, size))
-        break;
-      tag.dataEnd = &data[pos];
-      i = 0;
-      while (relevantTags[i] != NULL)
-        {
-          if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
-              (0 == strncasecmp (relevantTags[i],
-                                 tag.tagStart, tag.tagEnd - tag.tagStart)))
-            {
-              t = malloc (sizeof (TagInfo));
-             if (t == NULL)
-               return 0;
-              *t = tag;
-              t->next = tags;
-              tags = t;
-              break;
-            }
-          i++;
-        }
-      /* abort early if we hit the body tag */
-      if (tagMatch ("body", tag.tagStart, tag.tagEnd))
-        break;
-    }
-
-  /* fast exit */
-  if (tags == NULL)
-    return 0;
-
-  charset = NULL;
-  /* first, try to determine mime type and/or character set */
-  tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
-  if (tmp != NULL)
-    {
-      /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like 
that;
-         if text/html is present, we take that as the mime-type; if charset=
-         is present, we try to use that for character set conversion. */
-      if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
-        ret = proc (proc_cls, 
-                   "html",
-                   EXTRACTOR_METATYPE_MIMETYPE,
-                   EXTRACTOR_METAFORMAT_UTF8,
-                   "text/plain",
-                   "text/html",
-                   strlen ("text/html")+1);
-      charset = strcasestr (tmp, "charset=");
-      if (charset != NULL)
-        charset = strdup (&charset[strlen ("charset=")]);
-      free (tmp);
-    }
-  i = 0;
-  while (tagmap[i].name != NULL)
-    {
-      tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
-      if ( (tmp != NULL) &&
-          (ret == 0) )
-        {
-         if (charset == NULL)
-           {
-             ret = proc (proc_cls,
-                         "html",
-                         tagmap[i].type,
-                         EXTRACTOR_METAFORMAT_C_STRING,
-                         "text/plain",
-                         tmp,
-                         strlen (tmp) + 1);
-           }
-         else
-           {
-             xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
-                                                      strlen (tmp),
-                                                      charset);
-             if (xtmp != NULL)
-               {
-                 ret = proc (proc_cls,
-                             "html",
-                             tagmap[i].type,
-                             EXTRACTOR_METAFORMAT_UTF8,
-                             "text/plain",
-                             xtmp,
-                             strlen (xtmp) + 1);
-                 free (xtmp);
-               }
-           }
-        }
-      if (tmp != NULL)
-       free (tmp);
-      i++;
-    }
-  while (tags != NULL) 
-    {
-      t = tags;
-      if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
-          (ret == 0) )
-       {
-         if (charset == NULL)
-           {
-             xtmp = malloc (t->dataEnd - t->dataStart + 1);
-             if (xtmp != NULL)
-               {
-                 memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
-                 xtmp[t->dataEnd - t->dataStart] = '\0';
-                 ret = proc (proc_cls,
-                             "html",
-                             EXTRACTOR_METATYPE_TITLE,
-                             EXTRACTOR_METAFORMAT_C_STRING,
-                             "text/plain",
-                             xtmp,
-                             strlen (xtmp) + 1);
-                 free (xtmp);
-               }
-           }
-         else
-           {
-             xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
-                                                      t->dataEnd - 
t->dataStart,
-                                                      charset);
-             if (xtmp != NULL)
-               {
-                 ret = proc (proc_cls,
-                             "html",
-                             EXTRACTOR_METATYPE_TITLE,
-                             EXTRACTOR_METAFORMAT_UTF8,
-                             "text/plain",
-                             xtmp,
-                             strlen (xtmp) + 1);
-                 free (xtmp);
-               }
-           }
-       }
-      tags = t->next;
-      free (t);
-    }
-  if (charset != NULL)
-    free (charset);
-  return ret;
-}

Deleted: Extractor/src/plugins/old/man_extractor.c
===================================================================
--- Extractor/src/plugins/old/man_extractor.c   2012-08-17 14:54:29 UTC (rev 23288)
+++ Extractor/src/plugins/old/man_extractor.c   2012-08-17 16:25:36 UTC (rev 23289)
@@ -1,232 +0,0 @@
-/*
-     This file is part of libextractor.
-     (C) 2002, 2003, 2004, 2009 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
- */
-
-#include "platform.h"
-#include "extractor.h"
-#include <ctype.h>
-
-static char *
-stndup (const char *str, size_t n)
-{
-  char *tmp;
-  tmp = malloc (n + 1);
-  if (tmp == NULL)
-    return NULL;
-  tmp[n] = '\0';
-  memcpy (tmp, str, n);
-  return tmp;
-}
-
-static int
-addKeyword (enum EXTRACTOR_MetaType type,
-            char *keyword, 
-           EXTRACTOR_MetaDataProcessor proc,
-           void *proc_cls)
-{
-  int ret;
-  if (keyword == NULL)
-    return 0;
-  if (strlen (keyword) == 0)
-    {
-      free (keyword);
-      return 0;
-    }
-  if ((keyword[0] == '\"') && (keyword[strlen (keyword) - 1] == '\"'))
-    {
-      char *tmp;
-
-      keyword[strlen (keyword) - 1] = '\0';
-      tmp = strdup (&keyword[1]);
-      free (keyword);
-      if (tmp == NULL)
-       return 0;
-      keyword = tmp;
-    }
-  if (strlen (keyword) == 0)
-    {
-      free (keyword);
-      return 0;
-    }
-  ret = proc (proc_cls, 
-             "man",
-             type,
-             EXTRACTOR_METAFORMAT_UTF8,
-             "text/plain",
-             keyword,
-             strlen (keyword)+1);
-  free (keyword);
-  return ret;
-}
-
-static void
-NEXT (size_t * end, const char *buf, const size_t size)
-{
-  int quot;
-
-  quot = 0;
-  while ((*end < size) && (((quot & 1) != 0) || ((buf[*end] != ' '))))
-    {
-      if (buf[*end] == '\"')
-        quot++;
-      (*end)++;
-    }
-  if ((quot & 1) == 1)
-    (*end) = size + 1;
-}
-
-/**
- * How many bytes do we actually try to scan? (from the beginning
- * of the file).
- */
-#define MAX_READ (16 * 1024)
-
-#define ADD(t,s) do { if (0 != addKeyword (t, s, proc, proc_cls)) return 1; } 
while (0)
-
-int 
-EXTRACTOR_man_extract (const char *buf,
-                      size_t size,
-                      EXTRACTOR_MetaDataProcessor proc,
-                      void *proc_cls,
-                      const char *options)
-{
-  int pos;
-  size_t xsize;
-  const size_t xlen = strlen (".TH ");
-
-  if (size > MAX_READ)
-    size = MAX_READ;
-  pos = 0;
-  if (size < xlen)
-    return 0;
-  while ((pos < size - xlen) &&
-         ((0 != strncmp (".TH ",
-                         &buf[pos],
-                         xlen)) || ((pos != 0) && (buf[pos - 1] != '\n'))))
-    {
-      if (!isgraph ((unsigned char) buf[pos]) && 
-         !isspace ((unsigned char) buf[pos]))
-        return 0;
-      pos++;
-    }
-  xsize = pos;
-  while ((xsize < size) && (buf[xsize] != '\n'))
-    xsize++;
-  size = xsize;
-
-  if (0 == strncmp (".TH ", &buf[pos], xlen))
-    {
-      size_t end;
-
-      pos += xlen;
-      end = pos;
-      NEXT (&end, buf, size);
-      if (end > size)
-        return 0;
-      if (end - pos > 0)
-        {
-          ADD (EXTRACTOR_METATYPE_TITLE, stndup (&buf[pos], end - pos));
-          pos = end + 1;
-        }
-      if (pos >= size)
-        return 0;
-      end = pos;
-      NEXT (&end, buf, size);
-      if (end > size)
-        return 0;
-      if (buf[pos] == '\"')
-        pos++;
-      if ((end - pos >= 1) && (end - pos <= 4))
-        {
-          switch (buf[pos])
-            {
-            case '1':
-              ADD (EXTRACTOR_METATYPE_SECTION,
-                  strdup (_("Commands")));
-              break;
-            case '2':
-              ADD (EXTRACTOR_METATYPE_SECTION,
-                                 strdup (_("System calls")));
-              break;
-            case '3':
-              ADD (EXTRACTOR_METATYPE_SECTION,
-                                 strdup (_("Library calls")));
-              break;
-            case '4':
-              ADD (EXTRACTOR_METATYPE_SECTION,
-                                 strdup (_("Special files")));
-              break;
-            case '5':
-              ADD (EXTRACTOR_METATYPE_SECTION,
-                                 strdup (_("File formats and conventions")));
-              break;
-            case '6':
-              ADD (EXTRACTOR_METATYPE_SECTION,
-                                 strdup (_("Games")));
-              break;
-            case '7':
-              ADD (EXTRACTOR_METATYPE_SECTION,
-                                 strdup (_("Conventions and miscellaneous")));
-              break;
-            case '8':
-              ADD (EXTRACTOR_METATYPE_SECTION,
-                                 strdup (_("System management commands")));
-              break;
-            case '9':
-              ADD (EXTRACTOR_METATYPE_SECTION,
-                                 strdup (_("Kernel routines")));
-              break;
-            }
-          pos = end + 1;
-        }
-      end = pos;
-      NEXT (&end, buf, size);
-      if (end > size)
-        return 0;
-      if (end - pos > 0)
-        {
-          ADD (EXTRACTOR_METATYPE_MODIFICATION_DATE, stndup (&buf[pos], end - 
pos));
-          pos = end + 1;
-        }
-      end = pos;
-      NEXT (&end, buf, size);
-      if (end > size)
-        return 0;
-      if (end - pos > 0)
-        {
-          ADD (EXTRACTOR_METATYPE_SOURCE,
-              stndup (&buf[pos], end - pos));
-          pos = end + 1;
-        }
-      end = pos;
-      NEXT (&end, buf, size);
-      if (end > size)
-        return 0;
-      if (end - pos > 0)
-        {
-          ADD (EXTRACTOR_METATYPE_BOOK_TITLE,
-              stndup (&buf[pos], end - pos));
-          pos = end + 1;
-        }
-    }
-
-  return 0;
-}
-
-/* end of man_extractor.c */

Deleted: Extractor/src/plugins/old/riff_extractor.c
===================================================================
--- Extractor/src/plugins/old/riff_extractor.c  2012-08-17 14:54:29 UTC (rev 23288)
+++ Extractor/src/plugins/old/riff_extractor.c  2012-08-17 16:25:36 UTC (rev 23289)
@@ -1,123 +0,0 @@
-/*
-     This file is part of libextractor.
-     (C) 2004, 2009 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
-
-     This code was based on AVInfo 1.0 alpha 11
-     (c) George Shuklin, gs]AT[shounen.ru, 2002-2004
-     http://shounen.ru/soft/avinfo/
-
-     and bitcollider 0.6.0
-     (PD) 2004 The Bitzi Corporation
-     http://bitzi.com/
- */
-
-#include "platform.h"
-#include "extractor.h"
-#include <math.h>
-
-/**
- * Read the specified number of bytes as a little-endian (least
- * significant byte first) integer.
- */
-static unsigned int
-fread_le (const char *data)
-{
-  int x;
-  unsigned int result = 0;
-
-  for (x = 0; x < 4; x++)
-    result |= ((unsigned char) data[x]) << (x * 8);
-  return result;
-}
-
-/* We implement our own rounding function, because the availability of
- * C99's round(), nearbyint(), rint(), etc. seems to be spotty, whereas
- * floor() is available in math.h on all C compilers.
- */
-static double
-round_double (double num)
-{
-  return floor (num + 0.5);
-}
-
-#define ADD(s,t) do { if (0 != (ret = proc (proc_cls, "riff", t, 
EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1))) goto FINISH; } while 
(0)
-
-/* video/x-msvideo */
-int 
-EXTRACTOR_riff_extract (const char *xdata,
-                       size_t xsize,
-                       EXTRACTOR_MetaDataProcessor proc,
-                       void *proc_cls,
-                       const char *options)
-{
-  unsigned int blockLen;
-  unsigned int fps;
-  unsigned int duration;
-  size_t pos;
-  unsigned int width;
-  unsigned int height;
-  char codec[5];
-  char format[256];
-  int ret;
-
-  if (xsize < 32)
-    return 0;
-  if ((memcmp (&xdata[0],
-               "RIFF", 4) != 0) || (memcmp (&xdata[8], "AVI ", 4) != 0))
-    return 0;
-  if (memcmp (&xdata[12], "LIST", 4) != 0)
-    return 0;
-  if (memcmp (&xdata[20], "hdrlavih", 8) != 0)
-    return 0;
-
-  blockLen = fread_le (&xdata[28]);
-
-  /* begin of AVI header at 32 */
-  fps = (unsigned int) round_double ((double) 1.0e6 / fread_le (&xdata[32]));
-  duration = (unsigned int) round_double ((double) fread_le (&xdata[48])
-                                          * 1000 / fps);
-  width = fread_le (&xdata[64]);
-  height = fread_le (&xdata[68]);
-  /* pos: begin of video stream header */
-  pos = blockLen + 32;
-
-  if ((pos < blockLen) || (pos + 32 > xsize) || (pos > xsize))
-    return 0;
-  if (memcmp (&xdata[pos], "LIST", 4) != 0)
-    return 0;
-  blockLen = fread_le (&xdata[pos + 4]);
-  if (memcmp (&xdata[pos + 8], "strlstrh", 8) != 0)
-    return 0;
-  if (memcmp (&xdata[pos + 20], "vids", 4) != 0)
-    return 0;
-  ret = 0;
-  /* pos + 24: video stream header */
-  memcpy (codec, &xdata[pos + 24], 4);
-  codec[4] = '\0';
-  snprintf (format,
-           sizeof(format),
-           _("codec: %s, %u fps, %u ms"), codec, fps, duration);
-  ADD (format, EXTRACTOR_METATYPE_FORMAT);
-  snprintf (format, 
-           sizeof(format), 
-           "%ux%u", width, height);
-  ADD (format, EXTRACTOR_METATYPE_IMAGE_DIMENSIONS);
-  ADD ("video/x-msvideo", EXTRACTOR_METATYPE_MIMETYPE);
- FINISH:
-  return ret;
-}

Copied: Extractor/src/plugins/riff_extractor.c (from rev 23273, Extractor/src/plugins/old/riff_extractor.c)
===================================================================
--- Extractor/src/plugins/riff_extractor.c                              (rev 0)
+++ Extractor/src/plugins/riff_extractor.c      2012-08-17 16:25:36 UTC (rev 23289)
@@ -0,0 +1,157 @@
+/*
+     This file is part of libextractor.
+     (C) 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 3, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+
+     This code was based on AVInfo 1.0 alpha 11
+     (c) George Shuklin, gs]AT[shounen.ru, 2002-2004
+     http://shounen.ru/soft/avinfo/
+
+     and bitcollider 0.6.0
+     (PD) 2004 The Bitzi Corporation
+     http://bitzi.com/
+ */
+/**
+ * @file plugins/riff_extractor.c
+ * @brief plugin to support RIFF files (ms-video)
+ * @author Christian Grothoff
+ */
+#include "platform.h"
+#include "extractor.h"
+#include <math.h>
+
+
+/**
+ * Read an uint32_t as a little-endian (least
+ * significant byte first) integer from 'data'
+ *
+ * @param data input data
+ * @return integer read
+ */
+static uint32_t
+fread_le (const char *data)
+{
+  unsigned int x;
+  uint32_t result = 0;
+
+  /* Widen each byte to uint32_t *before* shifting: a bare 'unsigned char'
+     promotes to (signed) 'int', and shifting a value >= 0x80 left by 24
+     bits would overflow into the sign bit, which is undefined behavior. */
+  for (x = 0; x < 4; x++)
+    result |= ((uint32_t) (unsigned char) data[x]) << (x * 8);
+  return result;
+}
+
+
+/**
+ * We implement our own rounding function, because the availability of
+ * C99's round(), nearbyint(), rint(), etc. seems to be spotty, whereas
+ * floor() is available in math.h on all C compilers.
+ *
+ * @param num value to round
+ * @return rounded-to-nearest value
+ */
+static double
+round_double (double num)
+{
+  /* shift by one half so that floor() lands on the nearest integer */
+  const double shifted = num + 0.5;
+
+  return floor (shifted);
+}
+
+
+/**
+ * Pass the given UTF-8 string to the 'proc' callback using
+ * the given type.  Uses 'return' if 'proc' returns non-0, so the
+ * macro may only be used inside a 'void' function that has an
+ * extraction context named 'ec' in scope.  Note that 's' is
+ * evaluated twice (once for the value, once for 'strlen').
+ *
+ * @param s 0-terminated UTF8 string value with the meta data
+ * @param t libextractor type for the meta data
+ */
+#define ADD(s,t) do { \
+    if (0 != ec->proc (ec->cls, "riff", (t), \
+                       EXTRACTOR_METAFORMAT_UTF8, "text/plain", \
+                       (s), strlen (s) + 1)) \
+      return; \
+  } while (0)
+
+
+/**
+ * Main entry method for the 'video/x-msvideo' extraction plugin.
+ *
+ * Reads the first 72 bytes (RIFF/AVI prefix plus the start of the main
+ * AVI header), derives frame rate, duration and frame size from it, then
+ * seeks to the stream list that follows the main header to obtain the
+ * video codec.  Returns without reporting anything if a read or seek
+ * comes up short, a magic value does not match, or the header claims a
+ * frame period of zero.
+ *
+ * @param ec extraction context provided to the plugin
+ */
+void
+EXTRACTOR_riff_extract_method (struct EXTRACTOR_ExtractContext *ec)
+{
+  void *data;
+  const char *xdata;
+  uint32_t blockLen;
+  uint32_t usec_per_frame;
+  uint32_t frames;
+  double ms;
+  unsigned int fps;
+  unsigned int duration;
+  uint64_t pos;
+  uint32_t width;
+  uint32_t height;
+  char codec[5];
+  char format[256];
+
+  /* read header */
+  if (72 > ec->read (ec->cls, &data, 72))
+    return;
+  xdata = data;
+
+  /* check magic values */
+  if ( (0 != memcmp (&xdata[0], "RIFF", 4)) ||
+       (0 != memcmp (&xdata[8], "AVI ", 4)) ||
+       (0 != memcmp (&xdata[12], "LIST", 4)) ||
+       (0 != memcmp (&xdata[20], "hdrlavih", 8)) )
+    return;
+
+  /* size of the 'avih' chunk payload that starts at offset 32 */
+  blockLen = fread_le (&xdata[28]);
+
+  /* begin of AVI header at 32: microseconds per frame */
+  usec_per_frame = fread_le (&xdata[32]);
+  if (0 == usec_per_frame)
+    return; /* malformed header; the division below would yield infinity */
+  fps = (unsigned int) round_double ((double) 1.0e6 / usec_per_frame);
+
+  /* offset 48: total number of frames */
+  frames = fread_le (&xdata[48]);
+  if (0 != fps)
+    ms = round_double ((double) frames * 1000 / fps);
+  else /* frame period above 2s rounds to 0 fps; use the period directly */
+    ms = round_double ((double) frames * usec_per_frame / 1000);
+  /* clamp, as converting an out-of-range double to an integer is undefined */
+  duration = (ms >= (double) ~0U) ? ~0U : (unsigned int) ms;
+
+  width = fread_le (&xdata[64]);
+  height = fread_le (&xdata[68]);
+
+  /* pos: begin of video stream header; computed in 64 bits so that a
+     huge 'blockLen' cannot wrap around to a small offset */
+  pos = (uint64_t) blockLen + 32;
+
+  if (pos !=
+      ec->seek (ec->cls, pos, SEEK_SET))
+    return;
+  if (32 > ec->read (ec->cls, &data, 32))
+    return;
+  xdata = data;
+
+  /* check magic */
+  if ( (0 != memcmp (xdata, "LIST", 4)) ||
+       (0 != memcmp (&xdata[8], "strlstrh", 8)) ||
+       (0 != memcmp (&xdata[20], "vids", 4)) )
+    return;
+
+  /* pos + 24: video stream header with codec */
+  memcpy (codec, &xdata[24], 4);
+  codec[4] = '\0';
+  snprintf (format,
+           sizeof (format),
+           _("codec: %s, %u fps, %u ms"),
+           codec, fps, duration);
+  ADD (format, EXTRACTOR_METATYPE_FORMAT);
+  snprintf (format,
+           sizeof (format),
+           "%ux%u",
+           (unsigned int) width,
+           (unsigned int) height);
+  ADD (format, EXTRACTOR_METATYPE_IMAGE_DIMENSIONS);
+  ADD ("video/x-msvideo", EXTRACTOR_METATYPE_MIMETYPE);
+}
+
+/* end of riff_extractor.c */

Added: Extractor/src/plugins/test_html.c
===================================================================
--- Extractor/src/plugins/test_html.c                           (rev 0)
+++ Extractor/src/plugins/test_html.c   2012-08-17 16:25:36 UTC (rev 23289)
@@ -0,0 +1,124 @@
+/*
+     This file is part of libextractor.
+     (C) 2012 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 3, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+*/
+/**
+ * @file plugins/test_html.c
+ * @brief testcase for html plugin
+ * @author Christian Grothoff
+ */
+#include "platform.h"
+#include "test_lib.h"
+
+
+/**
+ * Main function for the HTML testcase.
+ *
+ * @param argc number of arguments (ignored)
+ * @param argv arguments (ignored)
+ * @return 0 on success
+ */
+int
+main (int argc, char *argv[])
+{
+  /* Meta data items expected for 'testdata/html_grothoff.html' (paired
+     with that file in 'ps' below).  Each entry lists a meta type, a meta
+     format, a mime type string, the value and its length including the
+     0-terminator; the meaning of the trailing integer is defined by
+     'struct SolutionData' in test_lib.h -- TODO confirm there. */
+  struct SolutionData html_grothoff_sol[] =
+    {
+      { 
+       EXTRACTOR_METATYPE_TITLE,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       "Christian Grothoff",
+       strlen ("Christian Grothoff") + 1,
+       0 
+      },
+      { 
+       EXTRACTOR_METATYPE_DESCRIPTION,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       "Homepage of Christian Grothoff",
+       strlen ("Homepage of Christian Grothoff") + 1,
+       0 
+      },
+      { 
+       EXTRACTOR_METATYPE_AUTHOR_NAME,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       "Christian Grothoff",
+       strlen ("Christian Grothoff") + 1,
+       0 
+      },
+      { 
+       EXTRACTOR_METATYPE_KEYWORDS,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       "Christian,Grothoff",
+       strlen ("Christian,Grothoff") + 1,
+       0 
+      },
+      { 
+       EXTRACTOR_METATYPE_TITLE,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       "Welcome to Christian Grothoff",
+       strlen ("Welcome to Christian Grothoff") + 1,
+       0 
+      },
+      { 
+       EXTRACTOR_METATYPE_LANGUAGE,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       "en",
+       strlen ("en") + 1,
+       0 
+      },
+      { 
+       EXTRACTOR_METATYPE_PUBLISHER,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       "Christian Grothoff",
+       strlen ("Christian Grothoff") + 1,
+       0 
+      },
+      { 
+       EXTRACTOR_METATYPE_UNKNOWN_DATE,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       "2000-08-20",
+       strlen ("2000-08-20") + 1,
+       0 
+      },
+      { 
+       EXTRACTOR_METATYPE_RIGHTS,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       "(C) 2000 by Christian Grothoff",
+       strlen ("(C) 2000 by Christian Grothoff") + 1,
+       0 
+      },
+      /* presumably the end-of-list marker (NULL strings, -1 in the
+         last field) -- verify against test_lib */
+      { 0, 0, NULL, NULL, 0, -1 }
+    };
+  /* input files to run, each paired with its expected results; the
+     all-NULL entry presumably terminates the list */
+  struct ProblemSet ps[] =
+    {
+      { "testdata/html_grothoff.html",
+       html_grothoff_sol },
+      { NULL, NULL }
+    };
+  /* hand the problem set to the shared test driver for the "html" plugin
+     and use its result as the exit status */
+  return ET_main ("html", ps);
+}
+
+/* end of test_html.c */

Added: Extractor/src/plugins/test_man.c
===================================================================
--- Extractor/src/plugins/test_man.c                            (rev 0)
+++ Extractor/src/plugins/test_man.c    2012-08-17 16:25:36 UTC (rev 23289)
@@ -0,0 +1,85 @@
+/*
+     This file is part of libextractor.
+     (C) 2012 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 3, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+*/
+/**
+ * @file plugins/test_man.c
+ * @brief testcase for man plugin
+ * @author Christian Grothoff
+ */
+#include "platform.h"
+#include "test_lib.h"
+
+
+
+/**
+ * Main function for the MAN testcase; checks testdata/man_extract.1.
+ *
+ * @param argc number of arguments (ignored)
+ * @param argv arguments (ignored)
+ * @return 0 on success (the value returned by ET_main)
+ */
+int
+main (int argc, char *argv[])
+{
+  struct SolutionData man_extract_sol[] = /* expected metadata items */
+    {
+      {
+       EXTRACTOR_METATYPE_TITLE,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       "EXTRACT", /* title argument of the page's .TH line */
+       strlen ("EXTRACT") + 1, /* length includes the 0-terminator */
+       0
+      },
+      {
+       EXTRACTOR_METATYPE_SECTION,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       _("Commands"), /* presumably the localized name of section 1 */
+       strlen (_("Commands")) + 1,
+       0
+      },
+      {
+       EXTRACTOR_METATYPE_MODIFICATION_DATE,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       "Aug 7, 2012", /* date argument of the .TH line */
+       strlen ("Aug 7, 2012") + 1,
+       0
+      },
+      {
+       EXTRACTOR_METATYPE_SOURCE,
+       EXTRACTOR_METAFORMAT_UTF8,
+       "text/plain",
+       "libextractor 0.7.0", /* copied from .TH like title/date: no _() */
+       strlen ("libextractor 0.7.0") + 1,
+       0
+      },
+      { 0, 0, NULL, NULL, 0, -1 } /* last entry; presumably end marker */
+    };
+  struct ProblemSet ps[] = /* one test file with its expected results */
+    {
+      { "testdata/man_extract.1",
+       man_extract_sol },
+      { NULL, NULL } /* last entry; presumably end marker */
+    };
+  return ET_main ("man", ps);
+}
+
+/* end of test_man.c */

Copied: Extractor/src/plugins/testdata/html_grothoff.html (from rev 23273, Extractor/test/test.html)
===================================================================
--- Extractor/src/plugins/testdata/html_grothoff.html   (rev 0)
+++ Extractor/src/plugins/testdata/html_grothoff.html   2012-08-17 16:25:36 UTC (rev 23289)
@@ -0,0 +1,44 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN">
+<html lang="en">
+<head>
+<title>Christian Grothoff</title>
+<meta name="description" content="Homepage of Christian Grothoff">
+<meta name="author" content="Christian Grothoff">
+<meta name="keywords" content="Christian,Grothoff">
+<meta name="robots" content="index,follow">
+<meta name="revisit-after" content="28 days">
+<meta name="title" content="Welcome to Christian Grothoff">
+<meta name="content-language" content="en">
+<meta name="language" content="en">
+<meta name="publisher" content="Christian Grothoff">
+<meta name="date" content="2000-08-20">
+<meta name="rights" content="(C) 2000 by Christian Grothoff">
+<meta http-equiv="expires" content="43200">
+<meta http-equiv="content-type" content="text/html;CHARSET=iso8859-1">
+<meta http-equiv="Content-Style-Type" content="text/css">
+<link rel=stylesheet type="text/css" href="grothoff.css">
+<script language="JavaScript">
+<!--
+ if(top.frames.length > 0)
+ top.location.href=self.location;
+//-->
+</script>
+</head>
+<frameset cols="180,*" border=5 frameborder=5 framespacing=5 bordercolor="#000000">
+<frame src="navigation.php3?currenttopic=Welcome" name="navigation">
+<frame src="welcome.php3" name="contentwindow">
+</frameset>
+<body>
+<h1>Welcome to Christian Grothoff</h1>
+<hr class="big">
+<br clear=all>
+<ul>
+<li><A href="welcome.php3">Welcome</A></li>
+<li><A href="cs/">Computer Science</A></li>
+<li><A href="linux/">Linux</A></li>
+<li><A href="http://www.stud.uni-wuppertal.de/~ma0035/">Willkommen (my german homepage)</A></li>
+</ul>
+<hr>
+<A href="mailto:address@hidden";><em>address@hidden</em></A>
+</body>
+</html>

Added: Extractor/src/plugins/testdata/man_extract.1
===================================================================
--- Extractor/src/plugins/testdata/man_extract.1        (rev 0)
+++ Extractor/src/plugins/testdata/man_extract.1        2012-08-17 16:25:36 UTC (rev 23289)
@@ -0,0 +1,109 @@
+.TH EXTRACT 1 "Aug 7, 2012" "libextractor 0.7.0"
+.\" $Id
+.SH NAME
+extract
+\- determine meta-information about a file
+.SH SYNOPSIS
+.B extract
+[
+.B \-bgihLmnvV
+]
+[
+.B \-l
+.I library
+]
+[
+.B \-p
+.I type
+]
+[
+.B \-x
+.I type
+]
+.I file
+\&...
+.br
+.SH DESCRIPTION
+This manual page documents version 0.7.0 of the
+.B extract
+command.
+.PP
+.B extract
+tests each file specified in the argument list in an attempt to infer meta\-information from it.  Each file is subjected to the meta\-data extraction libraries from
+.I libextractor.
+.PP
+libextractor classifies meta\-information (also referred to as keywords) into types. A list of all types can be obtained with the
+.B \-L
+option.
+
+.SH OPTIONS
+.TP 8
+.B \-b
+Display the output in BiBTeX format.
+.TP 8
+.B \-g
+Use grep\-friendly output (all keywords on a single line for each file).  Use the verbose option to print the filename first, followed by the keywords.  Use the verbose option twice to also display the keyword types.  This option will not print keyword types or non\-textual metadata.
+.TP 8
+.B \-h
+Print a brief summary of the options.
+.TP 8
+.B \-i
+Run plugins in\-process (for debugging).  By default, each plugin is run in its own process.
+.TP 8
+.BI \-l " libraries"
+Use the specified libraries to extract keywords. The general format of libraries is .I [[\-]LIBRARYNAME[:[\-]LIBRARYNAME]*] where LIBRARYNAME is a libextractor compatible library and typically of the form .Ijpeg\. The minus before the libraryname indicates that this library should be removed from the existing list.  To run only a few selected plugins, use \-l in combination with \-n.
+.TP 8
+.B \-L
+Print a list of all known keyword types.
+.TP 8
+.B \-m
+Load the file into memory and perform extraction from memory (for debugging).
+.TP 8
+.B \-n
+Do not use the default set of extractors (typically all standard extractors, currently mp3, ogg, jpg, gif, png, tiff, real, html, pdf and mime\-types), use only the extractors specified with the .B \-l option.
+.TP
+.B \-p " type"
+Print only the keywords matching the specified type. By default, all keywords that are found and not removed as duplicates are printed.
+.TP 8
+.B \-v
+Print the version number and exit.
+.TP 8
+.B \-V
+Be verbose.  This option can be specified multiple times to increase verbosity further.
+.TP 8
+.I \-x " type"
+Exclude keywords of the specified type from the output. By default, all keywords that are found and not removed as duplicates are printed.
+.SH SEE ALSO
+.BR libextractor (3)
+\- description of the libextractor library
+.br
+.SH EXAMPLES
+.nf
+$ extract test/test.jpg
+comment \- (C) 2001 by Christian Grothoff, using gimp 1.2 1
+mimetype \- image/jpeg
+
+$ extract \-V \-x comment test/test.jpg
+Keywords for file test/test.jpg:
+mimetype \- image/jpeg
+
+$ extract \-p comment test/test.jpg
+comment \- (C) 2001 by Christian Grothoff, using gimp 1.2 1
+
+$ extract \-nV \-l png.so \-p comment test/test.jpg test/test.png
+Keywords for file test/test.jpg:
+Keywords for file test/test.png:
+comment \- Testing keyword extraction
+
+.SH LEGAL NOTICE
+libextractor and the extract tool are released under the GPL.  libextractor is a GNU package.
+
+.SH BUGS
+A couple of file\-formats (on the order of 10^3) are not recognized...
+
+.SH AUTHORS
+.B extract
+was originally written by Christian Grothoff <address@hidden> and Vidyut Samanta <address@hidden>. Use <address@hidden> to contact the current maintainer(s).
+
+.SH AVAILABILITY
+You can obtain the original author's latest version from http://www.gnu.org/software/libextractor/

Modified: Extractor/src/plugins/thumbnailgtk_extractor.c
===================================================================
--- Extractor/src/plugins/thumbnailgtk_extractor.c      2012-08-17 14:54:29 UTC (rev 23288)
+++ Extractor/src/plugins/thumbnailgtk_extractor.c      2012-08-17 16:25:36 UTC (rev 23289)
@@ -74,10 +74,9 @@
   void *buf;
   const char *mime;
 
-  iret = ec->read (ec->cls,
-                  &data,
-                  16 * 1024);
-  if (-1 == iret)
+  if (-1 == (iret = ec->read (ec->cls,
+                             &data,
+                             16 * 1024)))
     return;
   if (NULL == (mime = magic_buffer (magic, data, iret)))
     return;

Deleted: Extractor/test/test.html
===================================================================
--- Extractor/test/test.html    2012-08-17 14:54:29 UTC (rev 23288)
+++ Extractor/test/test.html    2012-08-17 16:25:36 UTC (rev 23289)
@@ -1,44 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN">
-<html lang="en">
-<head>
-<title>Christian Grothoff</title>
-<meta name="description" content="Homepage of Christian Grothoff">
-<meta name="author" content="Christian Grothoff">
-<meta name="keywords" content="Christian,Grothoff">
-<meta name="robots" content="index,follow">
-<meta name="revisit-after" content="28 days">
-<meta name="title" content="Welcome to Christian Grothoff">
-<meta name="content-language" content="en">
-<meta name="language" content="en">
-<meta name="publisher" content="Christian Grothoff">
-<meta name="date" content="2000-08-20">
-<meta name="rights" content="(C) 2000 by Christian Grothoff">
-<meta http-equiv="expires" content="43200">
-<meta http-equiv="content-type" content="text/html;CHARSET=iso8859-1">
-<meta http-equiv="Content-Style-Type" content="text/css">
-<link rel=stylesheet type="text/css" href="grothoff.css">
-<script language="JavaScript">
-<!--
- if(top.frames.length > 0)
- top.location.href=self.location;
-//-->
-</script>
-</head>
-<frameset cols="180,*" border=5 frameborder=5 framespacing=5 bordercolor="#000000">
-<frame src="navigation.php3?currenttopic=Welcome" name="navigation">
-<frame src="welcome.php3" name="contentwindow">
-</frameset>
-<body>
-<h1>Welcome to Christian Grothoff</h1>
-<hr class="big">
-<br clear=all>
-<ul>
-<li><A href="welcome.php3">Welcome</A></li>
-<li><A href="cs/">Computer Science</A></li>
-<li><A href="linux/">Linux</A></li>
-<li><A href="http://www.stud.uni-wuppertal.de/~ma0035/">Willkommen (my german homepage)</A></li>
-</ul>
-<hr>
-<A href="mailto:address@hidden";><em>address@hidden</em></A>
-</body>
-</html>




reply via email to

[Prev in Thread] Current Thread [Next in Thread]