package org.paris5.cocoon.transformation; import java.io.File; import java.io.IOException; import java.io.Serializable; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Map; import org.apache.avalon.excalibur.pool.Recyclable; import org.apache.avalon.framework.activity.Disposable; import org.apache.avalon.framework.component.ComponentException; import org.apache.avalon.framework.component.ComponentManager; import org.apache.avalon.framework.configuration.Configurable; import org.apache.avalon.framework.configuration.Configuration; import org.apache.avalon.framework.configuration.ConfigurationException; import org.apache.avalon.framework.context.Context; import org.apache.avalon.framework.context.ContextException; import org.apache.avalon.framework.context.Contextualizable; import org.apache.avalon.framework.parameters.Parameters; import org.apache.cocoon.Constants; import org.apache.cocoon.ProcessingException; import org.apache.cocoon.caching.CacheableProcessingComponent; import org.apache.cocoon.components.search.LuceneCocoonHelper; import org.apache.cocoon.environment.SourceResolver; import org.apache.cocoon.transformation.AbstractTransformer; import org.apache.excalibur.source.SourceValidity; import org.apache.excalibur.source.impl.validity.NOPValidity; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.DateField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; /** * A lucene index creation transformer. * @author Nicolas Maisoneuve *

Example of input source:

<page xmlns:lucene="http://apache.org/cocoon/lucene/1.0">
<lucene:index create="true"
analyzer="org.apache.lucene.analysis.standard.StandardAnalyzer"
directory="d:/indexbase"
merge-factor="20">
<lucene:document>

<lucene:field name="title" type="keyword">sqdqsdq</lucene:field>
<lucene:field name="description" type="text"> bla bal blalael balbal </lucene:field>
<lucene:field name="date" type="date" dateformat="MM/dd/yyyy">10/12/2002</lucene:field>

(see the Java API class SimpleDateFormat for details about the dateformat attribute)


<lucene:field name="date" type="unstored" >just indexed information (not stored)</lucene:field>
<lucene:field name="date" type="unindexed" >just stored information (not indexed)</lucene:field>
</lucene:document>

<lucene:document>
<lucene:field name="author" type="keyword" boost="2">Mr Author</lucene:field>

(boost the field for the search (see Lucene documentation))

<lucene:field name="language" type="keyword">french</lucene:field>
</lucene:document>
</lucene:index>

<lucene:delete directory="d:/indexbase" >
<lucene:document field="author" value="Mr Author"/> (delete all documents with the field author ="Mr Author")
<lucene:document field="id" value="1E3RFE"/>
</lucene:delete>

</page>

Example of Output Source

<page xmlns:lucene="http://apache.org/cocoon/lucene/1.0">
<lucene:index nbdocuments="2"/>
<lucene:delete nbdocuments="1"/>
</page>

 */
public class LuceneIndexTransformer extends AbstractTransformer
        implements Disposable, CacheableProcessingComponent, Recyclable, Configurable, Contextualizable {

    // ---- Component configuration child names and sitemap parameter names ----

    /** Name of the configuration child holding the default analyzer class name. */
    public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
    /** Name of the sitemap parameter overriding the analyzer class name. */
    public static final String ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname";
    /** Name of the configuration child holding the default index directory. */
    public static final String DIRECTORY_CONFIG = "directory";
    /** Name of the sitemap parameter overriding the index directory. */
    public static final String DIRECTORY_PARAMETER = "directory";
    /** Name of the configuration child holding the default merge factor. */
    public static final String MERGE_FACTOR_CONFIG = "merge-factor";
    /** Name of the sitemap parameter overriding the merge factor. */
    public static final String MERGE_FACTOR_PARAMETER = "merge-factor";

    // ---- Built-in defaults used when neither configuration nor parameters give a value ----

    /** Default index directory name. */
    public static final String DIRECTORY_DEFAULT = "index";
    /** Default merge factor. */
    public static final int MERGE_FACTOR_DEFAULT = 20;
    /** Default analyzer class name. */
    public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";

    // ---- Vocabulary of the input markup ----

    /** Namespace URI of the elements this transformer reacts to. */
    public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";

    /** Local name of the element that starts an indexing (add) action. */
    public static final String LUCENE_QUERY_ELEMENT = "index";
    /** Attribute of the index element: analyzer class name. */
    public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";
    /** Attribute of the index and delete elements: index directory. */
    public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";
    /** Attribute of the index element: "yes" or "true" (any case) requests index creation. */
    public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create";
    /** Attribute of the index element: merge factor, parsed as an integer. */
    public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor";

    /** Local name of the element that starts a delete action. */
    public static final String LUCENE_DELETE_ELEMENT = "delete";

    // NOTE(review): the two constants below are not referenced in the visible part of this class.
    public static final String DOCUMENT_NAME_ATTRIBUTE = "name";
    public static final String DOCUMENT_VALUE_ATTRIBUTE = "value";

    /** Local name of the document element nested in an index or delete element. */
    public static final String LUCENE_DOCUMENT_ELEMENT = "document";
    /** Attribute of a document element in a delete action: name of the field to match. */
    public static final String LUCENE_DOCUMENT_FIELD_ATTRIBUTE = "field";
    /** Attribute of a document element in a delete action: value of the field to match. */
    public static final String LUCENE_DOCUMENT_VALUE_ATTRIBUTE = "value";

    /** Local name of the field element nested in a document element. */
    public static final String LUCENE_FIELD_ELEMENT = "field";
    /** Attribute of the field element: field name (required). */
    public static final String LUCENE_FIELD_NAME_ATTRIBUTE = "name";
    /** Attribute of the field element: field type (required). */
    public static final String LUCENE_FIELD_TYPE_ATTRIBUTE = "type";
    /** Attribute of the field element: SimpleDateFormat pattern, required for type "date". */
    public static final String LUCENE_FIELD_DATEFORMAT_ATTRIBUTE = "dateformat";
    /** Attribute of the field element: boost, parsed as a float; 1.0 when absent. */
    public static final String LUCENE_FIELD_BOOST_ATTRIBUTE = "boost";

    /** Internal code for the field type "keyword". */
    public static final int TYPE_KEYWORD = 1;
public static final int TYPE_TEXT = 2; public static final int TYPE_DATE = 3; public static final int TYPE_UNSTORED = 4; public static final int TYPE_UNINDEXED = 5; public static final int ADD_ACTION = 1; public static final int DELETE_ACTION = 2; // Initialization time variables protected ComponentManager manager = null; protected File workDir = null; protected int nbdocuments; protected int action; // Declaration time parameters values private String analyzerClassnameDefault; private String directoryDefault; private int mergeFactorDefault; // Invocation time parameters values private String analyzerClassname; private String directory; private int mergeFactor; // Runtime variables private int processing; private IndexWriter writer; private IndexReader reader; private Term term; private Document bodyDocument; private String fieldname; private int fieldtype; private float fieldboost; private String fieldvalue; private SimpleDateFormat df; private static String uid(String url) { return url.replace('/', '\u0000'); // + "\u0000" + DateField.timeToString(urlConnection.getLastModified()); } public void configure(Configuration conf) throws ConfigurationException { this.analyzerClassnameDefault = conf.getChild(ANALYZER_CLASSNAME_CONFIG) .getValue(ANALYZER_CLASSNAME_DEFAULT); this.mergeFactorDefault = conf.getChild(MERGE_FACTOR_CONFIG) .getValueAsInteger(MERGE_FACTOR_DEFAULT); this.directoryDefault = conf.getChild(DIRECTORY_CONFIG) .getValue(DIRECTORY_DEFAULT); } /** * Setup the transformer. 
*/ public void setup(SourceResolver resolver, Map objectModel, String src, Parameters parameters) throws ProcessingException, SAXException, IOException { // We don't need all this stuff this.analyzerClassname = parameters.getParameter( ANALYZER_CLASSNAME_PARAMETER, analyzerClassnameDefault); this.directory = parameters.getParameter(DIRECTORY_PARAMETER, directoryDefault); this.mergeFactor = parameters.getParameterAsInteger(MERGE_FACTOR_PARAMETER, mergeFactorDefault); } public void compose(ComponentManager manager) throws ComponentException { this.manager = manager; } /** * Contextualize this class */ public void contextualize(Context context) throws ContextException { this.workDir = (File) context.get(Constants.CONTEXT_WORK_DIR); } public void recycle() { this.processing = 0; if (this.writer != null) { try { this.writer.close(); } catch (IOException ioe) {this.getLogger().error("Recycle Error writer ");} this.writer = null; } if (this.reader != null) { try { this.reader.close(); } catch (IOException ioe) {this.getLogger().error("Recycle Error reader ");} this.reader = null; } this.bodyDocument = null; } public void dispose() { } /** * Generate the unique key. * This key must be unique inside the space of this component. * * @return The generated key */ public Serializable getKey() { return "1"; } /** * Generate the validity object. * * @return The generated validity object or null if the * component is currently not cacheable. */ public SourceValidity getValidity() { return NOPValidity.SHARED_INSTANCE; } public void startDocument() throws SAXException { super.startDocument(); } public void endDocument() throws SAXException { super.endDocument(); } /** * Begin the scope of a prefix-URI Namespace mapping. * * @param prefix The Namespace prefix being declared. * @param uri The Namespace URI the prefix is mapped to. 
*/ public void startPrefixMapping(String prefix, String uri) throws SAXException { if (processing == 0) { super.startPrefixMapping(prefix, uri); } } /** * End the scope of a prefix-URI mapping. * * @param prefix The prefix that was being mapping. */ public void endPrefixMapping(String prefix) throws SAXException { if (processing == 0) { super.endPrefixMapping(prefix); } } public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { //System.out.println("START processing: "+processing+" "+localName); if (processing == 0) { if (LUCENE_URI.equals(namespaceURI)) { // INDEX ACTION if (LUCENE_QUERY_ELEMENT.equals(localName)) { action = ADD_ACTION; // create base parameter String sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE); boolean bCreate = sCreate != null && (sCreate.equalsIgnoreCase("yes") || sCreate.equalsIgnoreCase("true")); // analyzer parameter String analyzerClassname = atts.getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE); if (analyzerClassname == null) { analyzerClassname = this.ANALYZER_CLASSNAME_DEFAULT; } Analyzer analyzer = LuceneCocoonHelper.getAnalyzer(analyzerClassname); // mergeFactor parameter String sMergeFactor = atts.getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE); int mergeFactor = this.mergeFactor; if (sMergeFactor != null) { mergeFactor = Integer.parseInt(sMergeFactor); } // directory parameter String directoryName = atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE); if (directoryName == null) { directoryName = this.directory; //System.out.println("QUERY Create=" + bCreate + ", Directory=" + directoryName + ", Analyzer=" + analyzerClassname); } try { Directory directory = LuceneCocoonHelper.getDirectory(new File( workDir, directoryName), bCreate); writer = new IndexWriter(directory, analyzer, bCreate); writer.mergeFactor = mergeFactor; } catch (IOException e) { throw new SAXException(e); } processing = 1; } // DELETE ACTION else if (LUCENE_DELETE_ELEMENT.equals(localName)) { action = 
DELETE_ACTION; // directory parameter String directoryName = atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE); if (directoryName == null) { directoryName = this.directory; } try { Directory directory = LuceneCocoonHelper.getDirectory( new File(workDir, directoryName), false); reader = LuceneCocoonHelper.getIndexReader(directory); //System.out.println("DELETE Directory=" + directoryName); } catch (IOException e) { throw new SAXException(e); } processing = 1; } } else { super.startElement(namespaceURI, localName, qName, atts); } } else if (processing == 1) { if (LUCENE_URI.equals(namespaceURI) && LUCENE_DOCUMENT_ELEMENT.equals(localName)) { if (action == ADD_ACTION) { this.bodyDocument = new Document(); } if (action == DELETE_ACTION) { this.term = new Term(atts.getValue(LUCENE_DOCUMENT_FIELD_ATTRIBUTE), atts.getValue(LUCENE_DOCUMENT_VALUE_ATTRIBUTE)); } processing = 2; } else { throw new SAXException( " element can contain only elements!"); } } else if (processing == 2) { if (LUCENE_URI.equals(namespaceURI) && LUCENE_FIELD_ELEMENT.equals(localName)) { this.fieldname = atts.getValue(LUCENE_FIELD_NAME_ATTRIBUTE); if (this.fieldname == null || this.fieldname.equals("")) { throw new SAXException( " element must contain name attribut"); } String fieldtype = atts.getValue(LUCENE_FIELD_TYPE_ATTRIBUTE); if (fieldtype == null || fieldtype.equals("")) { throw new SAXException( " element must contain a type attribut"); } if (fieldtype.equals("keyword")) { this.fieldtype = TYPE_KEYWORD; } else if (fieldtype.equals("text")) { this.fieldtype = TYPE_TEXT; } else if (fieldtype.equals("date")) { this.fieldtype = TYPE_DATE; String pattern = atts.getValue(LUCENE_FIELD_DATEFORMAT_ATTRIBUTE); if (pattern == null || pattern.equals("")) { throw new SAXException( " element must contain a dateformat attribut"); } df= new SimpleDateFormat(pattern); } else if (fieldtype.equals("unstored")) { this.fieldtype = TYPE_UNSTORED; } else if (fieldtype.equals("unindexed")) { this.fieldtype = 
TYPE_UNINDEXED; } String fieldboost = atts.getValue(LUCENE_FIELD_BOOST_ATTRIBUTE); if (fieldboost == null) { this.fieldboost = 1.0f; } else { this.fieldboost = Float.parseFloat(fieldboost); } System.out.println("fieldname: " + fieldname + " type: " + fieldtype + " boost: " + fieldboost); processing = 3; } else { throw new SAXException( " element can contain only elements!"); } } } public void endElement(String namespaceURI, String localName, String qName) throws SAXException { //System.out.println("END: processing: " + processing + " el: " + localName); if (processing == 1) { if (LUCENE_URI.equals(namespaceURI)) { //ADD ACTION if (LUCENE_QUERY_ELEMENT.equals(localName)) { // End query processing AttributesImpl attrs = new AttributesImpl(); attrs.addAttribute(null, "nbdocuments", "nbdocuments", "CDATA", Integer.toString(nbdocuments)); super.startElement(namespaceURI, localName, qName, attrs); super.endElement(namespaceURI, localName, qName); nbdocuments = 0; try { this.writer.optimize(); this.writer.close(); this.writer = null; } catch (IOException e) { throw new SAXException(e); } this.processing = 0; } // DELETE ACTION else if (LUCENE_DELETE_ELEMENT.equals(localName)) { try { AttributesImpl attrs = new AttributesImpl(); attrs.addAttribute(null, "nbdocuments", "nbdocuments", "CDATA", Integer.toString(nbdocuments)); super.startElement(namespaceURI, localName, qName, attrs); super.endElement(namespaceURI, localName, qName); nbdocuments = 0; this.reader.close(); this.reader = null; } catch (IOException e) { throw new SAXException(e); } this.processing = 0; } } else { if (action == ADD_ACTION) { throw new SAXException("