[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[GNUnet-SVN] r9769 - in Extractor: . src/include src/main src/plugins
From: |
gnunet |
Subject: |
[GNUnet-SVN] r9769 - in Extractor: . src/include src/main src/plugins |
Date: |
Tue, 15 Dec 2009 17:40:48 +0100 |
Author: grothoff
Date: 2009-12-15 17:40:48 +0100 (Tue, 15 Dec 2009)
New Revision: 9769
Added:
Extractor/src/plugins/pdf_extractor.cc
Removed:
Extractor/src/plugins/pdf/
Modified:
Extractor/configure.ac
Extractor/src/include/extractor.h
Extractor/src/main/extractor_metatypes.c
Extractor/src/plugins/Makefile.am
Extractor/src/plugins/dvi_extractor.c
Log:
pdf
Modified: Extractor/configure.ac
===================================================================
--- Extractor/configure.ac 2009-12-15 16:35:34 UTC (rev 9768)
+++ Extractor/configure.ac 2009-12-15 16:40:48 UTC (rev 9769)
@@ -217,7 +217,14 @@
AM_CONDITIONAL(HAVE_MPEG2, false))],
AM_CONDITIONAL(HAVE_MPEG2, false))
+AC_CHECK_LIB(poppler, _ZTI9MemStream,
+ [AC_CHECK_HEADERS([poppler/goo/gmem.h],
+ AM_CONDITIONAL(HAVE_POPPLER, true)
+ AC_DEFINE(HAVE_POPPLER,1,[Have libpoppler]),
+ AM_CONDITIONAL(HAVE_POPPLER, false))],
+ AM_CONDITIONAL(HAVE_POPPLER, false))
+
# restore LIBS
LIBS=$LIBSOLD
@@ -309,7 +316,6 @@
fi
# check for all C++ dependencies...
-xpdf=0
exiv2=0
qt=0
qt4=0
@@ -398,21 +404,6 @@
EXT_LIB_PATH="-L$with_qt/lib $EXT_LIB_PATH"
qt_svg=1)))])
-AC_MSG_CHECKING([whether to enable xpdf-based extractor])
-AC_ARG_ENABLE(xpdf,
- [AC_HELP_STRING([--enable-xpdf],[Enable xpdf-based extractor])
- AC_HELP_STRING([--disable-xpdf],[Disable xpdf-based extractor])],
- [case "$enableval" in
- no) AC_MSG_RESULT(no)
- xpdf=0
- ;;
- *) AC_MSG_RESULT(yes)
- xpdf=1
- ;;
- esac],
- [ AC_MSG_RESULT(no)
- xpdf=0])
-
exiv2=1
AC_MSG_CHECKING([whether to enable exiv2 extractor])
AC_ARG_ENABLE(exiv2,
@@ -455,8 +446,6 @@
AM_CONDITIONAL(HAVE_QT_SVG, test x$qt_svg != x0)
AM_CONDITIONAL(HAVE_QT_SVG4, test x$qt_svg4 != x0)
-AM_CONDITIONAL(HAVE_XPDF, test x$xpdf != x0)
-
AM_CONDITIONAL(HAVE_EXIV2, test x$exiv2 != x0)
AC_DEFINE_UNQUOTED([HAVE_EXIV2], $exiv2, [We use EXIV2])
@@ -569,7 +558,6 @@
src/plugins/Makefile
src/plugins/ole2/Makefile
src/plugins/oo/Makefile
-src/plugins/pdf/Makefile
src/plugins/printable/Makefile
src/plugins/hash/Makefile
src/plugins/thumbnail/Makefile
@@ -627,13 +615,6 @@
AC_OUTPUT
-if test "x$xpdf" = "x1"
-then
- AC_MSG_NOTICE([NOTICE: xpdf enabled (xpdf has a bad security record)])
-else
- AC_MSG_NOTICE([NOTICE: xpdf disabled (result: limited PDF support)])
-fi
-
if test "x$exiv2" = "x0"
then
AC_MSG_NOTICE([NOTICE: exiv2 disabled])
@@ -679,6 +660,11 @@
AC_MSG_NOTICE([NOTICE: libmpeg2 not found (will not compile mpeg2 plugin)])
fi
+if test "x$HAVE_POPPLER_TRUE" = "x#"
+then
+ AC_MSG_NOTICE([NOTICE: libpoppler not found (will not compile pdf plugin)])
+fi
+
if test "x$HAVE_CXX" != "xyes"
then
AC_MSG_NOTICE([NOTICE: no C++ compiler found (not compiling plugins that require C++)])
Modified: Extractor/src/include/extractor.h
===================================================================
--- Extractor/src/include/extractor.h 2009-12-15 16:35:34 UTC (rev 9768)
+++ Extractor/src/include/extractor.h 2009-12-15 16:40:48 UTC (rev 9769)
@@ -237,6 +237,10 @@
/* image specifics */
EXTRACTOR_METATYPE_IMAGE_DIMENSIONS = 112,
+
+
+ EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE = 113,
+
/* fixme: used up to here! */
EXTRACTOR_METATYPE_THUMBNAIL_DATA = 70,
Modified: Extractor/src/main/extractor_metatypes.c
===================================================================
--- Extractor/src/main/extractor_metatypes.c 2009-12-15 16:35:34 UTC (rev 9768)
+++ Extractor/src/main/extractor_metatypes.c 2009-12-15 16:40:48 UTC (rev 9769)
@@ -282,10 +282,13 @@
gettext_noop ("") },
{ gettext_noop ("image dimensions"),
gettext_noop ("") },
+ { gettext_noop ("produced by software"),
+ gettext_noop ("") }, /* what is the exact difference between the software
+ creator and the software producer? PDF and DVI
+ both have this distinction (i.e., Writer vs.
+ OpenOffice) */
{ gettext_noop (""),
gettext_noop ("") },
- { gettext_noop (""),
- gettext_noop ("") },
#if 0
gettext_noop("author"),
Modified: Extractor/src/plugins/Makefile.am
===================================================================
--- Extractor/src/plugins/Makefile.am 2009-12-15 16:35:34 UTC (rev 9768)
+++ Extractor/src/plugins/Makefile.am 2009-12-15 16:40:48 UTC (rev 9769)
@@ -46,22 +46,19 @@
if HAVE_EXIV2
exiv2=libextractor_exiv2.la
endif
+if HAVE_POPPLER
+ pdf=libextractor_pdf.la
endif
+endif
-if HAVE_XPDF
- xpdfdir=pdf
-else
- pdfplugin=libextractor_pdf.la
-endif
-
if HAVE_MPEG2
extrampeg = libextractor_mpeg.la
endif
# toggle for development
SUBDIRS = .
-# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash $(oledir) $(xpdfdir)
+# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash $(oledir)
if HAVE_VORBISFILE
@@ -95,6 +92,7 @@
libextractor_html.la \
libextractor_it.la \
libextractor_mime.la \
+ $(pdf) \
$(rpm)
libextractor_applefile_la_SOURCES = \
@@ -148,6 +146,19 @@
libextractor_it_la_LDFLAGS = \
$(PLUGINFLAGS)
+libextractor_mime_la_SOURCES = \
+ mime_extractor.c
+libextractor_mime_la_LDFLAGS = \
+ $(PLUGINFLAGS)
+
+libextractor_pdf_la_SOURCES = \
+ pdf_extractor.cc
+libextractor_pdf_la_LDFLAGS = \
+ $(XTRA_CPPLIBS) $(PLUGINFLAGS)
+libextractor_pdf_la_LIBADD = \
+ $(top_builddir)/src/common/libextractor_common.la \
+ -lpoppler
+
libextractor_rpm_la_SOURCES = \
rpm_extractor.c
libextractor_rpm_la_LDFLAGS = \
@@ -155,10 +166,6 @@
libextractor_rpm_la_LIBADD = \
-lrpm
-libextractor_mime_la_SOURCES = \
- mime_extractor.c
-libextractor_mime_la_LDFLAGS = \
- $(PLUGINFLAGS)
@@ -236,13 +243,6 @@
libextractor_wav_la_LIBADD = \
$(LE_LIBINTL)
-libextractor_pdf_la_SOURCES = \
- pdfextractor.c
-libextractor_pdf_la_LDFLAGS = \
- $(PLUGINFLAGS)
-libextractor_pdf_la_LIBADD = \
- $(top_builddir)/src/common/libextractor_common.la
-
libextractor_mp3_la_SOURCES = \
mp3extractor.c
libextractor_mp3_la_LDFLAGS = \
Modified: Extractor/src/plugins/dvi_extractor.c
===================================================================
--- Extractor/src/plugins/dvi_extractor.c 2009-12-15 16:35:34 UTC (rev 9768)
+++ Extractor/src/plugins/dvi_extractor.c 2009-12-15 16:40:48 UTC (rev 9769)
@@ -33,7 +33,7 @@
{"/Author (", EXTRACTOR_METATYPE_AUTHOR_NAME},
{"/Keywords (", EXTRACTOR_METATYPE_KEYWORDS},
{"/Creator (", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
- {"/Producer (", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE},
+ {"/Producer (", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE},
{NULL, 0},
};
Added: Extractor/src/plugins/pdf_extractor.cc
===================================================================
--- Extractor/src/plugins/pdf_extractor.cc (rev 0)
+++ Extractor/src/plugins/pdf_extractor.cc 2009-12-15 16:40:48 UTC (rev 9769)
@@ -0,0 +1,216 @@
+/*
+ This file is part of libextractor.
+ (C) 2002, 2003 Vidyut Samanta and Christian Grothoff
+
+ libextractor is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ libextractor is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libextractor; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+
+ This code was inspired by pdfinfo, a tool that is part of xpdf.
+ PDF parsing is now done through the poppler library (see the
+ poppler includes below).  See also the INFO file in this directory.
+ */
+
+#include "platform.h"
+#include "extractor.h"
+#include "convert.h"
+#include <math.h>
+
+#include <poppler/goo/gmem.h>
+#include <poppler/Object.h>
+#include <poppler/Stream.h>
+#include <poppler/Array.h>
+#include <poppler/Dict.h>
+#include <poppler/XRef.h>
+#include <poppler/Catalog.h>
+#include <poppler/Page.h>
+#include <poppler/PDFDoc.h>
+#include <poppler/Error.h>
+#include <poppler/goo/GooString.h>
+
+/**
+ * Hand one NUL-terminated text item to the metadata processor,
+ * tagged with plugin name "pdf", format UTF-8 and mime type
+ * "text/plain".
+ *
+ * The macro expands in place, so the call site must provide `proc`,
+ * `proc_cls`, an assignable `err` variable and an `EXIT` label.  If
+ * the processor returns non-zero, `err` is set to 1 and control jumps
+ * to `EXIT`.
+ *
+ * @param s string to report; must not be NULL (it is passed to
+ *          strlen) and is evaluated twice
+ * @param type meta type handed through to the processor
+ */
+#define ADD(s, type) do { \
+    if (0 != proc (proc_cls, "pdf", (type), EXTRACTOR_METAFORMAT_UTF8, \
+                   "text/plain", (s), strlen (s) + 1)) { \
+      err = 1; \
+      goto EXIT; \
+    } \
+  } while (0)
+
+/**
+ * Look up a string entry in a PDF info dictionary and report it to the
+ * metadata processor as UTF-8.
+ *
+ * A value that starts with the byte pair 0xFE 0xFF is converted from
+ * UTF-16BE (the two marker bytes are skipped).  Any other value has
+ * its trailing whitespace removed and is converted from ISO-8859-1.
+ * Entries that are missing, are not strings, or are empty after
+ * trimming are skipped silently.
+ *
+ * @param infoDict dictionary to search
+ * @param key name of the entry to look up
+ * @param type meta type to report the value under
+ * @param proc metadata processor callback
+ * @param proc_cls closure argument handed to proc
+ * @return 0 if processing may continue, 1 if proc returned non-zero
+ */
+static int
+printInfoString(Dict *infoDict,
+                const char *key,
+                enum EXTRACTOR_MetaType type,
+                EXTRACTOR_MetaDataProcessor proc,
+                void *proc_cls)
+{
+  Object obj;
+  GooString *s1;
+  const char *s;
+  char *ckey;
+  char *result = NULL;
+  int err = 0;
+
+  /* lookup() is given a private, writable copy of the key */
+  ckey = strdup (key);
+  if (NULL == ckey)
+    return 0; /* out of memory: skip this entry instead of passing NULL on */
+
+  if (infoDict->lookup(ckey, &obj)->isString()) {
+    s1 = obj.getString();
+    s = s1->getCString();
+    if ( (0xfe == (unsigned char) s[0]) &&
+         (0xff == (unsigned char) s[1]) ) {
+      result = EXTRACTOR_common_convert_to_utf8(&s[2], s1->getLength() - 2, "UTF-16BE");
+      /* convert.h is not visible from here; guard in case the
+         conversion can fail and return NULL, since ADD calls strlen */
+      if (NULL != result) {
+        ADD (result, type);
+      }
+    } else {
+      size_t len = strlen(s);
+
+      /*
+       * Avoid outputting trailing spaces.
+       *
+       * The characters are compared one by one instead of using
+       * isspace(), because some isspace() implementations seem to
+       * return non-zero for NBSP depending on the locale.  The
+       * ISO-8859 non-breaking space (0xA0) is treated as whitespace
+       * here; remove it from the test if that looks suspicious.
+       *
+       * Squeezing out all non-printable characters might also be useful.
+       */
+      while (0 < len) {
+        const char last = s[len - 1];
+
+        if ( (' ' != last) && ((char)0xA0 != last) &&
+             ('\r' != last) && ('\n' != last) &&
+             ('\t' != last) && ('\v' != last) &&
+             ('\f' != last) )
+          break;
+        len--;
+      }
+
+      /* there should be a check to truncate preposterously long values. */
+
+      if (0 < len) {
+        result = EXTRACTOR_common_convert_to_utf8(s, len, "ISO-8859-1");
+        if (NULL != result) {
+          ADD (result, type);
+        }
+      }
+    }
+  }
+ EXIT:
+  obj.free();
+  free (result);
+  free (ckey);
+  return err;
+}
+
+/**
+ * Look up a date entry in a PDF info dictionary and report it to the
+ * metadata processor.
+ *
+ * A value that starts with the byte pair 0xFE 0xFF is converted from
+ * UTF-16BE to UTF-8 (the two marker bytes are skipped).  Any other
+ * value is reported as it is, except that a leading "D:" is dropped.
+ * Entries that are missing or are not strings are skipped silently.
+ *
+ * @param infoDict dictionary to search
+ * @param key name of the entry to look up
+ * @param type meta type to report the value under
+ * @param proc metadata processor callback
+ * @param proc_cls closure argument handed to proc
+ * @return 0 if processing may continue, 1 if proc returned non-zero
+ */
+static int
+printInfoDate(Dict *infoDict,
+              const char *key,
+              enum EXTRACTOR_MetaType type,
+              EXTRACTOR_MetaDataProcessor proc,
+              void *proc_cls)
+{
+  Object obj;
+  const char *s;
+  GooString *s1;
+  char *gkey;
+  char *result = NULL;
+  int err = 0;
+
+  /* lookup() is given a private, writable copy of the key */
+  gkey = strdup (key);
+  if (NULL == gkey)
+    return 0; /* out of memory: skip this entry instead of passing NULL on */
+
+  if (infoDict->lookup(gkey, &obj)->isString()) {
+    s1 = obj.getString();
+    s = s1->getCString();
+
+    if ((s1->getChar(0) & 0xff) == 0xfe &&
+        (s1->getChar(1) & 0xff) == 0xff) {
+      /* isUnicode */
+      result = EXTRACTOR_common_convert_to_utf8((const char*)&s[2], s1->getLength() - 2, "UTF-16BE");
+      /* convert.h is not visible from here; guard in case the
+         conversion can fail and return NULL, since ADD calls strlen */
+      if (NULL != result) {
+        ADD (result, type);
+      }
+    } else {
+      /* skip the "D:" prefix if the value has one */
+      if (s[0] == 'D' && s[1] == ':')
+        s += 2;
+
+      ADD (s, type);
+    }
+  }
+ EXIT:
+  obj.free();
+  free (result);
+  free (gkey);
+  return err;
+}
+
+/**
+ * Report the string entry named `s` of the document info dictionary
+ * under meta type `t`, using printInfoString.
+ *
+ * Expands in place: the call site must provide `info`, `proc`,
+ * `proc_cls`, an assignable `err` variable and an `EXIT` label.  The
+ * return value of printInfoString is stored in `err`; if it is
+ * non-zero, control jumps to `EXIT`.
+ */
+#define PIS(s,t) do { \
+    if (0 != (err = printInfoString (info.getDict(), (s), (t), proc, proc_cls))) \
+      goto EXIT; \
+  } while (0)
+
+/**
+ * Report the date entry named `s` of the document info dictionary
+ * under meta type `t`, using printInfoDate.
+ *
+ * Expands in place: the call site must provide `info`, `proc`,
+ * `proc_cls`, an assignable `err` variable and an `EXIT` label.  The
+ * return value of printInfoDate is stored in `err`; if it is
+ * non-zero, control jumps to `EXIT`.
+ */
+#define PID(s,t) do { \
+    if (0 != (err = printInfoDate (info.getDict(), (s), (t), proc, proc_cls))) \
+      goto EXIT; \
+  } while (0)
+
+extern "C" {
+
+  /**
+   * Extract meta data from a PDF document held in memory.
+   *
+   * The buffer is wrapped in a MemStream and opened as a PDFDoc.  If
+   * the document is not OK, nothing is reported.  Otherwise the mime
+   * type "application/pdf" is reported first, followed by the string
+   * entries of the document info dictionary (if there is one), the
+   * page count, the PDF version, and finally the creation and
+   * modification dates from the info dictionary.
+   *
+   * Page count and PDF version are read from the document itself and
+   * not from the info dictionary, so they are reported even when the
+   * document has no info dictionary.
+   *
+   * @param data start of the document in memory
+   * @param size number of bytes at data
+   * @param proc metadata processor callback
+   * @param proc_cls closure argument handed to proc
+   * @param options unused by this plugin
+   * @return 0 if the document could not be opened or every item was
+   *         delivered, 1 if proc returned non-zero
+   */
+  int
+  EXTRACTOR_pdf_extract (const char *data,
+                         size_t size,
+                         EXTRACTOR_MetaDataProcessor proc,
+                         void *proc_cls,
+                         const char *options)
+  {
+    PDFDoc * doc;
+    Object info;
+    Object obj;
+    BaseStream * stream;
+    int err;
+    int haveInfo;
+    char buf[64];
+
+    /* errorInit(); -- keep commented out, otherwise errors are printed to stderr for non-pdf files! */
+    obj.initNull();
+    err = 0;
+    /* the stream is handed to the PDFDoc and is not deleted separately here */
+    stream = new MemStream( (char*) data, 0, size, &obj);
+    doc = new PDFDoc(stream, NULL, NULL);
+    if (! doc->isOk()) {
+      delete doc;
+      return 0;
+    }
+
+    ADD ("application/pdf",
+         EXTRACTOR_METATYPE_MIMETYPE);
+    haveInfo = ( (NULL != doc->getDocInfo(&info)) &&
+                 (info.isDict()) );
+    if (haveInfo) {
+      PIS ("Title", EXTRACTOR_METATYPE_TITLE);
+      PIS ("Subject", EXTRACTOR_METATYPE_SUBJECT);
+      PIS ("Keywords", EXTRACTOR_METATYPE_KEYWORDS);
+      PIS ("Author", EXTRACTOR_METATYPE_AUTHOR_NAME);
+      /*
+       * we now believe that Adobe's Creator is not a person nor an
+       * organisation, but just a piece of software.
+       */
+      PIS ("Creator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE);
+      PIS ("Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE);
+    }
+
+    /* document-level facts: independent of the info dictionary */
+    snprintf (buf, sizeof (buf), "%d", doc->getNumPages());
+    ADD (buf, EXTRACTOR_METATYPE_PAGE_COUNT);
+    snprintf (buf, sizeof (buf), "PDF %d.%d",
+              doc->getPDFMajorVersion(),
+              doc->getPDFMinorVersion());
+    ADD (buf, EXTRACTOR_METATYPE_FORMAT);
+
+    if (haveInfo) {
+      PID ("CreationDate", EXTRACTOR_METATYPE_CREATION_DATE);
+      PID ("ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE);
+    }
+  EXIT:
+    info.free();
+    delete doc;
+
+    return err;
+  }
+}
+
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [GNUnet-SVN] r9769 - in Extractor: . src/include src/main src/plugins,
gnunet <=