[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[GNUnet-SVN] r2921 - Extractor-python
From: |
grothoff |
Subject: |
[GNUnet-SVN] r2921 - Extractor-python |
Date: |
Sun, 28 May 2006 11:26:45 -0700 (PDT) |
Author: grothoff
Date: 2006-05-28 11:26:44 -0700 (Sun, 28 May 2006)
New Revision: 2921
Added:
Extractor-python/extractor.py
Modified:
Extractor-python/README
Log:
sync
Modified: Extractor-python/README
===================================================================
--- Extractor-python/README 2006-05-28 18:07:48 UTC (rev 2920)
+++ Extractor-python/README 2006-05-28 18:26:44 UTC (rev 2921)
@@ -1 +1,11 @@
This is the python binding for libextractor.
+
+Actually, there are two bindings in this package.
+One written by Heiko Wundram (abandoned) and
+another one drafted by Bader Lejmi (extractor.py).
+
+Also note that there is a third binding for both
+libextractor and doodle by nokos, available at
+http://gnunet.org/libextractor/download/
+
+I have no idea which is best at this time.
Added: Extractor-python/extractor.py
===================================================================
--- Extractor-python/extractor.py 2006-05-28 18:07:48 UTC (rev 2920)
+++ Extractor-python/extractor.py 2006-05-28 18:26:44 UTC (rev 2921)
@@ -0,0 +1,304 @@
+# -*- coding: utf-8 -*-
+## Python bindings for GNU libextractor
+##
+## Copyright (C) 2006 Bader Ladjemi <address@hidden>
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; see the file COPYING. If not, write to the
+## Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+## USA.
+##
+"""
+Python bindings for GNU libextractor
+
+libextractor is a simple library for keyword extraction. libextractor
+does not support all formats but supports a simple plugging mechanism
+such that you can quickly add extractors for additional formats, even
+without recompiling libextractor. libextractor typically ships with a
+dozen helper-libraries that can be used to obtain keywords from common
+file-types.
+
+libextractor is a part of the GNU project (http://www.gnu.org/).
+"""
+from ctypes import *
+# Load the libextractor shared library: attribute access on ctypes' cdll
+# loader opens the library of that name (here "extractor").
+libextractor = cdll.extractor
+
+# Names exported by "from extractor import *".
+__all__ = ['Extractor', 'isBinaryType']
+__version__ = "0.5"
+__licence__ = "GNU GPL"
+
+# Encoding used to decode the keyword strings returned by libextractor.
+EXTRACTOR_ENCODING = "utf-8"
+
+KeywordType = c_int
+
+
+class Keywords(Structure):
+    """
+    ctypes mirror of the EXTRACTOR_Keywords struct: one node of a singly
+    linked list of keywords.
+
+    Fields:
+      keyword      the keyword value (C string)
+      keywordType  integer type code of the keyword
+      next         pointer to the next node (NULL at the end of the list)
+    """
+
+
+# The struct refers to itself, so its fields can only be assigned once the
+# class object exists.  This is the incomplete-type idiom from the ctypes
+# documentation; it replaces POINTER('Keywords') + SetPointerType(), which
+# current ctypes releases deprecate.
+Keywords._fields_ = [('keyword', c_char_p),
+                     ('keywordType', KeywordType),
+                     ('next', POINTER(Keywords))]
+
+# POINTER() caches its result, so both names denote the same pointer type.
+Keywords_p = POINTER(Keywords)
+KEYWORDS = POINTER(Keywords)
+
+# ctypes assumes a C int return value unless told otherwise, so declare the
+# functions that return a keyword list or a C string.
+libextractor.EXTRACTOR_getKeywords.restype = KEYWORDS
+libextractor.EXTRACTOR_getKeywords2.restype = KEYWORDS
+libextractor.EXTRACTOR_removeDuplicateKeywords.restype = KEYWORDS
+libextractor.EXTRACTOR_getKeywordTypeAsString.restype = c_char_p
+
+# NOTE(review): the block below is disabled.  It sketches a ctypes mirror of
+# the EXTRACTOR_Extractor struct and the matching restype declarations.
+# Without them, the functions that return an extractor list keep the default
+# int restype; presumably that truncates the handle on platforms where a
+# pointer is wider than an int -- confirm before relying on it.
+## Extractors_p = POINTER('Extractors')
+## ExtractMethod = CFUNCTYPE(Keywords, c_char_p, c_int, Keywords, c_char_p)
+## class Extractors(Structure):
+## """
+## EXTRACTOR_Extractor struct
+## """
+## _fields_ = [('libraryHandle', c_void_p),
+## ('libname', c_char_p),
+## ('extractMethod', ExtractMethod),
+## ('next', Extractors_p),
+## ('options', c_char_p)]
+## SetPointerType(Extractors_p, Extractors)
+
+## EXTRACTORS = POINTER(Extractors)
+
+## libextractor.EXTRACTOR_loadDefaultLibraries.restype = EXTRACTORS
+## libextractor.EXTRACTOR_loadConfigLibraries.restype = EXTRACTORS
+## libextractor.EXTRACTOR_addLibrary.restype = EXTRACTORS
+## libextractor.EXTRACTOR_addLibraryLast.restype = EXTRACTORS
+
+# Returns a C string; __init__ splits it on ':' to get the library names.
+libextractor.EXTRACTOR_getDefaultLibraries.restype = c_char_p
+
+class Extractor(object):
+ """
+ Main class for extracting meta-data with GNU libextractor.
+
+ You may create multiple instances of Extractor to use
+ different sets of libraries. Unless defaults is false, each
+ Extractor starts with the default set of libraries.
+
+ Use the extract method to obtain keywords from a file or from data.
+
+ Use the addLibrary, addLibraryLast, addLibraries, removeLibrary and
+ removeAllLibraries methods to change the list of libraries that
+ should be used.
+ """
+
+ def __init__(self, defaults=True, libraries=None, lang=None,
+              languages=None, hash=None, use_filename=False,
+              duplicates=True, split_keywords=False):
+     """
+     Initialize the Extractor instance.
+
+     @param defaults load the default set of libraries
+     @param libraries colon-separated string of additional library names
+     @param lang 2-letter language code; adds the generic plaintext
+            extractor library "libextractor_printable_<lang>"
+     @param languages iterable of language codes, each handled like lang
+     @param hash name of a hash algorithm (currently 'sha1' or 'md5');
+            adds the library "libextractor_hash_<hash>"
+     @param use_filename use the filename as a keyword (adds the
+            libextractor_filename library)
+     @param duplicates when true, extraction results are passed through
+            EXTRACTOR_removeDuplicateKeywords
+     @param split_keywords use keyword splitting (adds the
+            libextractor_split library)
+     """
+     self._libraries = {}
+     # Start from an empty (NULL) extractor list so the attribute exists
+     # even when the default libraries are not loaded.
+     self.extractors = None
+
+     if defaults:
+         self.extractors = libextractor.EXTRACTOR_loadDefaultLibraries()
+         default_names = libextractor.EXTRACTOR_getDefaultLibraries()
+         self._libraries = dict.fromkeys(default_names.split(':'))
+     if use_filename:
+         self.addLibrary("libextractor_filename")
+     if libraries:
+         self.extractors = libextractor.EXTRACTOR_loadConfigLibraries(
+             self.extractors, libraries)
+         self._libraries.update(dict.fromkeys(libraries.split(':')))
+     # The library name is the prefix followed by the code.  The former
+     # '"prefix_" % code' had no conversion specifier and raised TypeError.
+     if isinstance(lang, str):
+         self.addLibraryLast("libextractor_printable_" + lang)
+     if isinstance(hash, str):
+         self.addLibraryLast("libextractor_hash_" + hash)
+     if languages:
+         for language in languages:
+             self.addLibraryLast("libextractor_printable_" + language)
+     if split_keywords:
+         self.addLibraryLast("libextractor_split")
+
+     self.duplicates = duplicates
+
+ def extract(self, filename=None, data=None, size=None):
+     """Extract keywords from a file, or from data of a given size.
+
+     @param filename filename string
+     @param data data contents
+     @param size data size
+
+     A filename takes precedence: when it is given, the result of
+     extractFile(filename) is returned.  Otherwise, when both data and
+     size are given, the result of extractData(data, size) is returned.
+     With neither, None is returned.
+
+     """
+     if filename:
+         return self.extractFile(filename)
+     if data and size:
+         return self.extractData(data, size)
+     return None
+
+ def extractFile(self, filename):
+     """Extract keywords from the file named by filename.
+
+     @param filename filename string
+
+     The keyword list obtained from EXTRACTOR_getKeywords is stored in
+     self.keywords_p and converted by _extract(), whose result (a
+     dictionary of keyword types to keyword values, or None when no
+     keyword list was returned) is passed on to the caller.
+
+     """
+     found = libextractor.EXTRACTOR_getKeywords(self.extractors, filename)
+     self.keywords_p = found
+     return self._extract()
+
+ def extractData(self, data, size):
+     """Extract keywords from in-memory data.
+
+     @param data data contents
+     @param size data size
+
+     The keyword list obtained from EXTRACTOR_getKeywords2 is stored in
+     self.keywords_p and converted by _extract(), whose result (a
+     dictionary of keyword types to keyword values, or None when no
+     keyword list was returned) is passed on to the caller.
+
+     """
+     found = libextractor.EXTRACTOR_getKeywords2(
+         self.extractors, data, size)
+     self.keywords_p = found
+     return self._extract()
+
+ def _extract(self):
+     """
+     Convert the keyword list in self.keywords_p into a dictionary.
+
+     Returns None when self.keywords_p is NULL.  Otherwise returns a
+     dictionary mapping each keyword type name to a keyword value; when
+     several keywords share a type, the one latest in the list wins.
+     Values are decoded with EXTRACTOR_ENCODING unless isBinaryType()
+     reports the type as binary.  When self.duplicates is true the list
+     is first passed through EXTRACTOR_removeDuplicateKeywords.  The C
+     keyword list is freed before returning, also when conversion fails
+     part-way.
+     """
+     if not self.keywords_p:
+         return None
+
+     if self.duplicates:
+         self.keywords_p = libextractor.EXTRACTOR_removeDuplicateKeywords(
+             self.keywords_p, 1)
+
+     self.extracted = {}
+     node_p = self.keywords_p
+     try:
+         # A NULL ctypes pointer is false; that marks the end of the list.
+         while node_p:
+             self.keywords = node_p.contents
+             type_code = self.keywords.keywordType
+             keyword_type = libextractor.EXTRACTOR_getKeywordTypeAsString(
+                 type_code).decode(EXTRACTOR_ENCODING)
+             keyword = self.keywords.keyword
+
+             if not isBinaryType(type_code):
+                 keyword = keyword.decode(EXTRACTOR_ENCODING)
+
+             self.extracted[keyword_type] = keyword
+             node_p = self.keywords.next
+     finally:
+         # Release the C list even if decoding raised, so it cannot leak.
+         if self.keywords_p:
+             libextractor.EXTRACTOR_freeKeywords(self.keywords_p)
+     return self.extracted
+
+ def addLibrary(self, library):
+     """
+     Add the given library to the extractor.  Pass the name of the
+     library as a string; for example,
+
+     'libextractor_filename'
+
+     prepends the extractor that just adds the filename as a keyword.
+
+     No errors are reported if the library is not found.
+
+     @param library library's name
+     """
+     # record the name first, then hand it to libextractor
+     self._libraries.update({library: None})
+     updated = libextractor.EXTRACTOR_addLibrary(self.extractors, library)
+     self.extractors = updated
+
+ def addLibraryLast(self, library):
+     """
+     Same as addLibrary, but the library is added at the end of the
+     extractor list (via EXTRACTOR_addLibraryLast).
+
+     @param library library's name
+     """
+     # record the name first, then hand it to libextractor
+     self._libraries.update({library: None})
+     updated = libextractor.EXTRACTOR_addLibraryLast(
+         self.extractors, library)
+     self.extractors = updated
+
+ def removeLibrary(self, library):
+     """
+     Remove a library.  Pass the name of the library that is to
+     be removed.  Only one library can be removed at a time.
+     For example,
+
+     'libextractor_pdf'
+
+     removes the PDF extractor (if added).
+
+     @param library library's name
+     @raise ValueError if the library is not recorded as loaded
+     """
+     try:
+         del self._libraries[library]
+     except KeyError:
+         # Call form of raise: the old 'raise ValueError, "..."' statement
+         # is a syntax error on Python 3.
+         raise ValueError("No such loaded library")
+
+     self.extractors = libextractor.EXTRACTOR_removeLibrary(
+         self.extractors, library)
+
+ def addLibraries(self, libraries):
+     """
+     Add several libraries.
+     Same as addLibrary, applied to every name in libraries; entries
+     that are not str instances are skipped.
+
+     @param libraries list of libraries names
+     """
+     for name in libraries:
+         if not isinstance(name, str):
+             continue
+         self.addLibrary(name)
+
+ def removeAllLibraries(self):
+     """
+     Remove all libraries.
+
+     Clears the record of loaded library names and, when an extractor
+     list is held, releases it with EXTRACTOR_removeAll.
+     """
+     # The former assignment went to the misspelled "_libaries", so the
+     # record of loaded names was never cleared.
+     self._libraries = {}
+     extractors = getattr(self, 'extractors', None)
+     if extractors:
+         libextractor.EXTRACTOR_removeAll(extractors)
+     # Forget the released handle so it is neither reused nor freed twice.
+     self.extractors = None
+
+ def keywordTypes(self):
+     """
+     Returns the list of all keywords types.
+
+     Type codes are queried from 0 upwards with
+     EXTRACTOR_getKeywordTypeAsString until it returns an empty result.
+
+     @return list of all keywords types
+     """
+     keyword_types = []
+     code = 0
+     name = libextractor.EXTRACTOR_getKeywordTypeAsString(code)
+     while name:
+         keyword_types.append(name)
+         code += 1
+         name = libextractor.EXTRACTOR_getKeywordTypeAsString(code)
+     return keyword_types
+
+ def _get_libraries(self):
+     """Return the names of the libraries recorded as loaded."""
+     return self._libraries.keys()
+
+ def _set_libraries(self, libraries):
+     """Add every library in libraries (existing ones are kept)."""
+     self.addLibraries(libraries)
+
+ # NOTE(review): the doc string says "read only", yet assignment adds
+ # libraries and "del" removes them all -- confirm the intended contract.
+ libraries = property(fget=_get_libraries, fset=_set_libraries,
+                      fdel=removeAllLibraries,
+                      doc='list of loaded libraries (read only)')
+
+ def __del__(self):
+     """Release the loaded libraries when the instance is destroyed."""
+     # __init__ may have failed before the attribute was set.
+     if getattr(self, 'extractors', None):
+         self.removeAllLibraries()
+
+ # The finalizer used to be defined under the descriptor hook name
+ # "__delete__", which is never invoked on destruction.  The old name is
+ # kept so explicit calls keep working.
+ __delete__ = __del__
+
+# Keyword type code that isBinaryType() treats as binary data.
+EXTRACTOR_THUMBNAIL_DATA = 70
+
+
+def isBinaryType(keyword_type):
+    """Tell whether keyword_type equals EXTRACTOR_THUMBNAIL_DATA."""
+    is_thumbnail = (keyword_type == EXTRACTOR_THUMBNAIL_DATA)
+    return is_thumbnail
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [GNUnet-SVN] r2921 - Extractor-python,
grothoff <=