/* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001, 2002, 2003 The Apache Software Foundation. All * rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact apache@apache.org. * * 5. Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * . */ options { STATIC=false; JAVA_UNICODE_ESCAPE=true; USER_CHAR_STREAM=true; } PARSER_BEGIN(ZeroPositionIncrementTokenQueryParser) package fr.gouv.culture.sdx.search.lucene.queryparser; import java.util.Vector; import java.io.*; import java.text.*; import java.util.*; import org.apache.lucene.index.Term; import org.apache.lucene.analysis.*; import org.apache.lucene.document.*; import org.apache.lucene.search.*; import fr.gouv.culture.sdx.search.lucene.DateField; import fr.gouv.culture.sdx.search.lucene.analysis.Analyzer; import fr.gouv.culture.sdx.search.lucene.analysis.DefaultAnalyzer; /*TODO-MAINTAIN: Modified lucene class (from Lucene CVS Version 1.38) * * This is a class based on a lucene class and has differences * that we should keep up-to-date with the lucene sources * */ /** * This class is generated by JavaCC. The only method that clients should need * to call is parse(). * * The syntax for query strings is as follows: * A Query is a series of clauses. * A clause may be prefixed by: *

a plus (+) or a minus (-) sign, indicating * that the clause is required or prohibited respectively; or *
a term followed by a colon, indicating the field to be searched. * This enables one to construct queries which search multiple fields. *

* * A clause may be either: *

a term, indicating all the documents that contain this term; or *
a nested query, enclosed in parentheses. Note that this may be used * with a +/- prefix to require any of a set of * terms. *

* * Thus, in BNF, the query grammar is: *

 *   Query  ::= ( Clause )*
 *   Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
 *

* *

* Examples of appropriately formatted queries can be found in the test cases. *

* * @author Brian Goetz * @author Peter Halacsy * @author Tatu Saloranta */ public class ZeroPositionIncrementTokenQueryParser implements QueryParser { private static final int CONJ_NONE = 0; private static final int CONJ_AND = 1; private static final int CONJ_OR = 2; private static final int MOD_NONE = 0; private static final int MOD_NOT = 10; private static final int MOD_REQ = 11; public static final int DEFAULT_OPERATOR_OR = 0; public static final int DEFAULT_OPERATOR_AND = 1; /** The actual operator that parser uses to combine query terms */ private int operator = DEFAULT_OPERATOR_OR; /** * Whether terms of wildcard and prefix queries are to be automatically * lower-cased or not. Default is true. */ boolean lowercaseWildcardTerms = true; Analyzer analyzer; String field; int phraseSlop = 0; Locale locale = Locale.getDefault(); /** Parses a query string, returning a {@link org.apache.lucene.search.Query}. * @param query the query string to be parsed. * @param field the default field for query terms. * @param analyzer used to find terms in the query text. * @throws ParseException if the parsing fails */ static public Query parse(String query, String field, Analyzer analyzer) throws ParseException { ZeroPositionIncrementTokenQueryParser parser = new ZeroPositionIncrementTokenQueryParser(field, analyzer); return parser.parse(query); } /**Constructs a query parser.*/ public ZeroPositionIncrementTokenQueryParser(){ this(new FastCharStream(new StringReader(""))); } /** Constructs a query parser. * @param f the default field for query terms. * @param a used to find terms in the query text. */ public ZeroPositionIncrementTokenQueryParser(String f, Analyzer a) { this(new FastCharStream(new StringReader(""))); analyzer = a; field = f; } /**Sets the fields of the query parser * @param f the default field for query terms. * @param a used to find terms in the query text. */ public void setUp(String f, Analyzer a){ analyzer = a; field = f; } /**Sets the fields of the query parser * @param a used to find terms in the query text. * @param phraseSlop the slop * @param operator the operator */ public void setUp(Analyzer a, int phraseSlop, int operator){ analyzer = a; setPhraseSlop(phraseSlop); setOperator(operator); } /** Sets the fields of the query parser * @param f the default field for query terms. * @param a used to find terms in the query text. * @param phraseSlop the slop * @param operator the operator */ public void setUp(String f, Analyzer a, int phraseSlop, int operator){ field = f; setUp(a, phraseSlop, operator); } /** Parses a query string, returning a * Query. * @param query the query string to be parsed. * @throws ParseException if the parsing fails */ public Query parse(String query) throws ParseException { ReInit(new FastCharStream(new StringReader(query))); try { return Query(field); } catch (TokenMgrError tme) { throw new ParseException(tme.getMessage()); } catch (BooleanQuery.TooManyClauses tmc) { throw new ParseException("Too many boolean clauses"); } } /** * Sets the default slop for phrases. If zero, then exact phrase matches * are required. Default value is zero. */ public void setPhraseSlop(int phraseSlop) { this.phraseSlop = phraseSlop; } /** * Gets the default slop for phrases. */ public int getPhraseSlop() { return phraseSlop; } /** * Sets the boolean operator of the QueryParser. * In classic mode (

DEFAULT_OPERATOR_OR) terms without any modifiers
   * are considered optional: for example capital of Hungary is equal to
   * capital OR of OR Hungary.

   * In DEFAULT_OPERATOR_AND terms are considered to be in conjuction: the
   * above mentioned query is parsed as capital AND of AND Hungary
   */
  public void setOperator(int operator) {
    this.operator = operator;
  }

  /**
   * Gets implicit operator setting, which will be either DEFAULT_OPERATOR_AND
   * or DEFAULT_OPERATOR_OR.
   */
  public int getOperator() {
    return operator;
  }

  public void setLowercaseWildcardTerms(boolean lowercaseWildcardTerms) {
    this.lowercaseWildcardTerms = lowercaseWildcardTerms;
  }

  public boolean getLowercaseWildcardTerms() {
    return lowercaseWildcardTerms;
  }

   /**
   * Set locale used by date range parsing.
   */
  public void setLocale(Locale locale) {
    this.locale = locale;
  }

   /**
   * Returns current locale, allowing access by subclasses.
   */
  public Locale getLocale() {
    return locale;
  }

  protected void addClause(Vector clauses, int conj, int mods, Query q) {
    boolean required, prohibited;

    // If this term is introduced by AND, make the preceding term required,
    // unless it's already prohibited
    if (conj == CONJ_AND) {
      BooleanClause c = (BooleanClause) clauses.elementAt(clauses.size()-1);
      if (!c.prohibited)
        c.required = true;
    }

    if (operator == DEFAULT_OPERATOR_AND && conj == CONJ_OR) {
      // If this term is introduced by OR, make the preceding term optional,
      // unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b)
      // notice if the input is a OR b, first term is parsed as required; without
      // this modification a OR b would parsed as +a OR b
      BooleanClause c = (BooleanClause) clauses.elementAt(clauses.size()-1);
      if (!c.prohibited)
        c.required = false;
    }

    // We might have been passed a null query; the term might have been
    // filtered away by the analyzer.
    if (q == null)
      return;

    if (operator == DEFAULT_OPERATOR_OR) {
      // We set REQUIRED if we're introduced by AND or +; PROHIBITED if
      // introduced by NOT or -; make sure not to set both.
      prohibited = (mods == MOD_NOT);
      required = (mods == MOD_REQ);
      if (conj == CONJ_AND && !prohibited) {
        required = true;
      }
    } else {
      // We set PROHIBITED if we're introduced by NOT or -; We set REQUIRED
      // if not PROHIBITED and not introduced by OR
      prohibited = (mods == MOD_NOT);
      required   = (!prohibited && conj != CONJ_OR);
    }
    clauses.addElement(new BooleanClause(q, required, prohibited));
  }

   /**
   * @exception ParseException throw in overridden method to disallow
   */
    protected Query getFieldQuery(String field,
                                  Analyzer analyzer,
                                  String queryText) throws ParseException {
        // Use the analyzer to get all the tokens, and then build a TermQuery,
        // PhraseQuery, or nothing based on the term count

        TokenStream source = analyzer.tokenStream(field,
                new StringReader(queryText));
        Vector v = new Vector();
        org.apache.lucene.analysis.Token t;

        int positionCount = 0;
        boolean severalTokensAtSamePosition = false;

        while (true) {
            try {
                t = source.next();
            } 
	    catch (IOException e) {
                t = null;
            }
            if (t == null) 
		break;
            v.addElement(t);
            if (t.getPositionIncrement() == 1)
                positionCount++;
            else
                severalTokensAtSamePosition = true;
        }
    try {
      source.close();
    } catch (IOException e) {
      // ignore
    }
        if (v.size() == 0)
            return null;
        else if (v.size() == 1) {
            t = (org.apache.lucene.analysis.Token) v.elementAt(0);
            return new TermQuery(new Term(field, t.termText()));
        } else {
            if (severalTokensAtSamePosition) {
                if (positionCount == 1) {
                    BooleanQuery q = new BooleanQuery();
                    for (int i = 0; i < v.size(); i++) {
                        t = (org.apache.lucene.analysis.Token) v.elementAt(i);
                        TermQuery currentQuery = new TermQuery(new Term(field, t.termText()));
                        q.add(currentQuery, false, false);
                    }
                    return q;
                } else {

                    BooleanQuery q = new BooleanQuery();
                    Vector queriesSoFar = new Vector();
                    Vector newQueries = new Vector();
                    PhraseQuery currentQuery;
                    int currentPosition = 0;

                    for (int i = 0; i < v.size(); i++) {
                        t = (org.apache.lucene.analysis.Token) v.elementAt(i);
                        //detect new position
                        if (t.getPositionIncrement() == 1) {
                            queriesSoFar.removeAllElements();
                            for (int j = 0; j < newQueries.size(); j++) {
                                queriesSoFar.add(newQueries.elementAt(j));
                            }
                            newQueries.removeAllElements();
                            currentPosition++;
                        }
                        if (currentPosition == 1) {
                            currentQuery = new PhraseQuery();
                            currentQuery.setSlop(phraseSlop);
                            currentQuery.add(new Term(field, t.termText()));
                            newQueries.add(currentQuery);
                            //Re-use previous queries
                        } else {
                            for (int j = 0; j < queriesSoFar.size(); j++) {
                                PhraseQuery previousQuery = (PhraseQuery) queriesSoFar.elementAt(j);
                                Term[] terms = previousQuery.getTerms();
                                currentQuery = new PhraseQuery();
                                currentQuery.setSlop(phraseSlop);
                                for (int k = 0; k < terms.length; k++) {
                                    currentQuery.add(terms[k]);
                                }
                                currentQuery.add(new Term(field, t.termText()));
                                newQueries.add(currentQuery);
                            }
                        }
                    }
                    for (int i = 0; i < newQueries.size(); i++) {
                        currentQuery = (PhraseQuery) newQueries.elementAt(i);
                        q.add(currentQuery, false, false);
                    }
                    return q;
                }
            } else {
                PhraseQuery q = new PhraseQuery();
                q.setSlop(phraseSlop);
                for (int i = 0; i < v.size(); i++) {
                    q.add(new Term(field, ((org.apache.lucene.analysis.Token) v.elementAt(i)).termText()));
                }
                return q;
            }
       }
  }

  protected Query getUnanalyzedQuery(String field, String queryText) {
    return new UnanalyzedQuery(new Term(field, queryText));
  }

  /**
   * @exception ParseException throw in overridden method to disallow
   */
  protected Query getRangeQuery(String field,
                              Analyzer analyzer,
                              String part1,
                              String part2,
                              boolean inclusive) throws ParseException
  {
    try {
      /*Lucene code:
      DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, locale);
      df.setLenient(true);
      Date d1 = df.parse(part1);
      Date d2 = df.parse(part2);
      part1 = DateField.dateToString(d1);
      part2 = DateField.dateToString(d2);
      */

      //since we use our own date format when we store date fields, we should do the same upon searching-rbp
      Date d1 = fr.gouv.culture.sdx.utils.Date.parseDate(part1);
      Date d2 = fr.gouv.culture.sdx.utils.Date.parseDate(part2);
      //using sdx date field support-rbp
      part1 = fr.gouv.culture.sdx.search.lucene.DateField.dateToString(d1);
      part2 = fr.gouv.culture.sdx.search.lucene.DateField.dateToString(d2);
    }
    catch (Exception e) { }

    return new RangeQuery(new Term(field, part1),
                          new Term(field, part2),
                          inclusive);
  }

  /**
   * Factory method for generating query, given a set of clauses.
   * By default creates a boolean query composed of clauses passed in.
   *
   * Can be overridden by extending classes, to modify query being
   * returned.
   *
   * @param clauses Vector that contains {@link BooleanClause} instances
   *    to join.
   *
   * @return Resulting {@link Query} object.
   * @exception ParseException throw in overridden method to disallow
   */
  protected Query getBooleanQuery(Vector clauses) throws ParseException
  {
    BooleanQuery query = new BooleanQuery();
    for (int i = 0; i < clauses.size(); i++) {
	query.add((BooleanClause)clauses.elementAt(i));
    }
    return query;
  }

  /**
   * Factory method for generating a query. Called when parser
   * parses an input term token that contains one or more wildcard
   * characters (? and *), but is not a prefix term token (one
   * that has just a single * character at the end)
   *
   * Depending on settings, prefix term may be lower-cased
   * automatically. It will not go through the default Analyzer,
   * however, since normal Analyzers are unlikely to work properly
   * with wildcard templates.
   *

   * Can be overridden by extending classes, to provide custom handling for
   * wildcard queries, which may be necessary due to missing analyzer calls.
   *
   * @param field Name of the field query will use.
   * @param termStr Term token that contains one or more wild card
   *   characters (? or *), but is not simple prefix term
   *
   * @return Resulting {@link Query} built for the term
   * @exception ParseException throw in overridden method to disallow
   */
  protected Query getWildcardQuery(String field, String termStr) throws ParseException
  {
    if (lowercaseWildcardTerms) {
	termStr = termStr.toLowerCase();
    }
    Term t = new Term(field, termStr);
    return new WildcardQuery(t);
  }

  /**
   * Factory method for generating a query (similar to
   * ({@link #getWildcardQuery}). Called when parser parses an input term
   * token that uses prefix notation; that is, contains a single '*' wildcard
   * character as its last character. Since this is a special case
   * of generic wildcard term, and such a query can be optimized easily,
   * this usually results in a different query object.
   *

   * Depending on settings, a prefix term may be lower-cased
   * automatically. It will not go through the default Analyzer,
   * however, since normal Analyzers are unlikely to work properly
   * with wildcard templates.
   *

   * Can be overridden by extending classes, to provide custom handling for
   * wild card queries, which may be necessary due to missing analyzer calls.
   *
   * @param field Name of the field query will use.
   * @param termStr Term token to use for building term for the query
   *    (without trailing '*' character!)
   *
   * @return Resulting {@link Query} built for the term
   * @exception ParseException throw in overridden method to disallow
   */
  protected Query getPrefixQuery(String field, String termStr) throws ParseException
  {
    if (lowercaseWildcardTerms) {
	termStr = termStr.toLowerCase();
    }
    Term t = new Term(field, termStr);
    return new PrefixQuery(t);
  }

  /**
   * Factory method for generating a query (similar to
   * ({@link #getWildcardQuery}). Called when parser parses
   * an input term token that has the fuzzy suffix (~) appended.
   *
   * @param field Name of the field query will use.
   * @param termStr Term token to use for building term for the query
   *
   * @return Resulting {@link Query} built for the term
   * @exception ParseException throw in overridden method to disallow
   */
  protected Query getFuzzyQuery(String field, String termStr) throws ParseException
  {
    Term t = new Term(field, termStr);
    return new FuzzyQuery(t);
  }

  public static void main(String[] args) throws Exception {
    ZeroPositionIncrementTokenQueryParser  qp = new ZeroPositionIncrementTokenQueryParser("field",
                           new DefaultAnalyzer());
    Query q = qp.parse(args[0]);
    System.out.println(q.toString("field"));
  }
}

PARSER_END(ZeroPositionIncrementTokenQueryParser)

/* ***************** */
/* Token Definitions */
/* ***************** */

<*> TOKEN : {
  <#_NUM_CHAR:   ["0"-"9"] >
| <#_ESCAPED_CHAR: "\\" [ "\\", "+", "-", "!", "(", ")", ":", "^",
                          "[", "]", "\"", "{", "}", "~", "*", "?" ] >
| <#_TERM_START_CHAR: ( ~[ " ", "\t", "+", "-", "!", "(", ")", ":", "^",
                           "[", "]", "\"", "{", "}", "~", "*", "?" ]
                       | <_ESCAPED_CHAR> ) >
| <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> ) >
| <#_WHITESPACE: ( " " | "\t" ) >
}

 SKIP : {
  <<_WHITESPACE>>
}

// OG: to support prefix queries:
// http://nagoya.apache.org/bugzilla/show_bug.cgi?id=12137
// Change from:
// | 
//              (<_TERM_CHAR> | ( [ "*", "?" ] ))* >
// To:
//
// |  | ( [ "*", "?" ] ))* >

 TOKEN : {
  
| 
| 
| 
| 
| 
| 
| 
|  : Boost
| 
| 
|  (<_TERM_CHAR>)*  >
| 
| )+ >
|  (<_TERM_CHAR>)* "*" >
|  | ( [ "*", "?" ] ))* >
|  : RangeIn
|  : RangeEx
}

 TOKEN : {
)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT
}

 TOKEN : {

|  : DEFAULT
| 
| 
}

 TOKEN : {

|  : DEFAULT
| 
| 
}

// *   Query  ::= ( Clause )*
// *   Clause ::= ["+", "-"] [ ":"] (  | "(" Query ")" )

int Conjunction() : {
  int ret = CONJ_NONE;
}
{
  [
     { ret = CONJ_AND; }
    |   { ret = CONJ_OR; }
  ]
  { return ret; }
}

int Modifiers() : {
  int ret = MOD_NONE;
}
{
  [
      { ret = MOD_REQ; }
     |  { ret = MOD_NOT; }
     |  { ret = MOD_NOT; }
  ]
  { return ret; }
}

Query Query(String field) :
{
  Vector clauses = new Vector();
  Query q, firstQuery=null;
  int conj, mods;
}
{
  mods=Modifiers() q=Clause(field)
  {
    addClause(clauses, CONJ_NONE, mods, q);
    if (mods == MOD_NONE)
        firstQuery=q;
  }
  (
    conj=Conjunction() mods=Modifiers() q=Clause(field)
    { addClause(clauses, conj, mods, q); }
  )*
    {
      if (clauses.size() == 1 && firstQuery != null)
        return firstQuery;
      else {
	return getBooleanQuery(clauses);
      }
    }
}

Query Clause(String field) : {
  Query q;
  Token fieldToken=null, boost=null;
}
{
  [
    LOOKAHEAD(2)
    fieldToken=  { field = fieldToken.image; }
  ]

  (
   q=Term(field)
   |  q=Query(field)  ( boost=)?

  )
    {
      if (boost != null) {
      	float f = (float)1.0;
	try {
	  f = Float.valueOf(boost.image).floatValue();
          q.setBoost(f);
	} catch (Exception ignored) { }
      }
      return q;
    }
}


Query Term(String field) : {
  Token term, boost=null, slop=null, goop1, goop2;
  boolean prefix = false;
  boolean wildcard = false;
  boolean fuzzy = false;
  boolean rangein = false;
  Query q;
}
{
  (
     (
       term=
       | term= { prefix=true; }
       | term= { wildcard=true; }
       | term=
     )
     [  { fuzzy=true; } ]
     [  boost= [  { fuzzy=true; } ] ]
     {
       if (wildcard) {
	 q = getWildcardQuery(field, term.image);
       } else if (prefix) {
         q = getPrefixQuery(field, term.image.substring
			    (0, term.image.length()-1));
       } else if (fuzzy) {
         q = getFuzzyQuery(field, term.image);
       } else {
         q = getFieldQuery(field, analyzer, term.image);
       }
     }
     | (  ( goop1=|goop1= )
         [  ] ( goop2=|goop2= )
          )
       [  boost= ]
        {
          if (goop1.kind == RANGEIN_QUOTED)
            goop1.image = goop1.image.substring(1, goop1.image.length()-1);
          if (goop2.kind == RANGEIN_QUOTED)
            goop2.image = goop2.image.substring(1, goop2.image.length()-1);

          q = getRangeQuery(field, analyzer, goop1.image, goop2.image, true);
        }
     | (  ( goop1=|goop1= )
         [  ] ( goop2=|goop2= )
          )
       [  boost= ]
        {
          if (goop1.kind == RANGEEX_QUOTED)
            goop1.image = goop1.image.substring(1, goop1.image.length()-1);
          if (goop2.kind == RANGEEX_QUOTED)
            goop2.image = goop2.image.substring(1, goop2.image.length()-1);

          q = getRangeQuery(field, analyzer, goop1.image, goop2.image, false);
        }
     | term=
       [ slop= ]
       [  boost= ]
       {
         q = getFieldQuery(field, analyzer,
                           term.image.substring(1, term.image.length()-1));
         if (slop != null && q instanceof PhraseQuery) {
           try {
             int s = Float.valueOf(slop.image.substring(1)).intValue();
             ((PhraseQuery) q).setSlop(s);
           }
           catch (Exception ignored) { }
         }
       }
       | term=
       [ slop= ]
       [  boost= ]
       {
         q = getUnanalyzedQuery(field, term.image.substring(1, term.image.length()-1));
         if (slop != null && q instanceof PhraseQuery) {
           try {
             int s = Float.valueOf(slop.image.substring(1)).intValue();
             ((PhraseQuery) q).setSlop(s);
           }
           catch (Exception ignored) { }
         }
       }
  )
  {
    if (boost != null) {
      float f = (float) 1.0;
      try {
        f = Float.valueOf(boost.image).floatValue();
      }
      catch (Exception ignored) {
	  /* Should this be handled somehow? (defaults to "no boost", if
	   * boost number is invalid)
	   */
      }

      // avoid boosting null queries, such as those caused by stop words
      if (q != null) {
        q.setBoost(f);
      }
    }
    return q;
  }
}