Re: Re: ANN: acts_as_ferret and a fix for unicode

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
2 messages Options
Reply | Threaded
Open this post in threaded view
|

Re: Re: ANN: acts_as_ferret and a fix for unicode

hui-5
It's so cool!
I am just looking for the CJK solutions,
Here is "JavaCC code for the Nutch lexical analyzer."
It is included in the Nutch source code, so could anyone port it to ferret?
====================================================
/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** JavaCC code for the Nutch lexical analyzer. */

// Generator options for this grammar.  The per-option notes paraphrase the
// JavaCC option documentation; they are not established by this file itself.
options {
  // Generate instance members rather than static ones; parseQuery() in the
  // parser class creates a new NutchAnalysis for every query.
  STATIC = false;
  // Read input through a caller-supplied character stream
  // (this file passes in a FastCharStream).
  USER_CHAR_STREAM = true;
  // Ask JavaCC for the optimized token manager.
  OPTIMIZE_TOKEN_MANAGER = true;
  // Accept Unicode input.
  // NOTE(review): the JavaCC docs say this option is ignored when
  // USER_CHAR_STREAM is true -- confirm whether it has any effect here.
  UNICODE_INPUT = true;
  // Uncomment the next line to trace the token manager while debugging.
//DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(NutchAnalysis)

package org.apache.nutch.analysis;

import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;

import org.apache.lucene.analysis.StopFilter;

import java.io.*;
import java.util.*;

/** The JavaCC-generated Nutch lexical analyzer and query parser. */
public class NutchAnalysis {

  private static final String[] STOP_WORDS = {
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "s", "such",
    "t", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };

  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);

  private String queryString;

  /** True iff word is a stop word.  Stop words are only removed from queries.
   * Every word is indexed.  */
  public static boolean isStopWord(String word) {
    return STOP_SET.contains(word);
  }

  /** Construct a query parser for the text in a reader. */
  public static Query parseQuery(String queryString) throws IOException {
    NutchAnalysis parser =
      new NutchAnalysis(new FastCharStream(new StringReader(queryString)));
    parser.queryString = queryString;
    return parser.parse();
  }

  /** For debugging. */
  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    while (true) {
      System.out.print("Query: ");
      String line = in.readLine();
      System.out.println(parseQuery(line));
    }
  }

}

PARSER_END(NutchAnalysis)

// Extra members declared for the token manager class.
TOKEN_MGR_DECLS : {

  /**
   * Constructs a token manager for the provided Reader.  The reader is
   * wrapped in a FastCharStream, which is passed on to another constructor
   * of this class.
   *
   * @param reader source of the characters to tokenize
   */
  public NutchAnalysisTokenManager(Reader reader) {
    this(new FastCharStream(reader));
  }

}

TOKEN : {  // token regular expressions

  // Basic word: a run of letters, digits and word punctuation, or one of the
  // irregular words below.  The token image is lowercased.
  // NOTE(review): toLowerCase() without an argument uses the JVM default
  // locale -- confirm that this is acceptable everywhere this runs.
<WORD: ((<LETTER>|<DIGIT>|<WORD_PUNCT>)+ | <IRREGULAR_WORD>)>
  { matchedToken.image = matchedToken.image.toLowerCase(); }

  // Special handling for acronyms (U.S.A., I.B.M., etc.): a letter and a dot
  // followed by one or more further letter-dot pairs.  The dots are removed
  // and the result is lowercased.
| <ACRONYM: <LETTER> "." (<LETTER> ".")+ >
    {                                             // remove dots
      for (int i = 0; i < image.length(); i++) {
        if (image.charAt(i) == '.')
          image.deleteCharAt(i--);                // step back: the rest shifted left
      }
      matchedToken.image = image.toString().toLowerCase();
    }

  // Chinese, Japanese and Korean characters: each one is a token of its own.
| <SIGRAM: <CJK> >

   // Irregular words: "C++" and "C#" in either case.  They contain characters
   // that are otherwise query syntax or unrecognized.
| <#IRREGULAR_WORD: (<C_PLUS_PLUS>|<C_SHARP>)>
| <#C_PLUS_PLUS: ("C"|"c") "++" >
| <#C_SHARP: ("C"|"c") "#" >

  // Query syntax characters.
| <PLUS: "+" >
| <MINUS: "-" >
| <QUOTE: "\"" >
| <COLON: ":" >
| <SLASH: "/" >
| <DOT: "." >
| <ATSIGN: "@" >
| <APOSTROPHE: "'" >

  // Any single character.  NOTE(review): this relies on JavaCC preferring
  // the earliest-declared expression among equally long matches, so the
  // tokens above win -- keep this after them.
| <WHITE: ~[] >                                   // treat unrecognized chars
                                                  // as whitespace
// Primitive, non-token patterns (the '#' names are only used inside other
// expressions).  Block and script names in the comments below follow the
// Unicode code charts.

| <#WORD_PUNCT: ("_"|"&")>                        // allowed anywhere in words

| < #LETTER:  // alphabets
    // A-Z, a-z, the Latin-1 letters (the gaps at U+00D7 and U+00F7 are the
    // multiplication and division signs), then everything from U+0100 to
    // U+1FFF.  That last range also contains the non-ASCII DIGIT ranges.
    [
        "\u0041"-"\u005a",
        "\u0061"-"\u007a",
        "\u00c0"-"\u00d6",
        "\u00d8"-"\u00f6",
        "\u00f8"-"\u00ff",
        "\u0100"-"\u1fff"
    ]
    >

|  <#CJK:                                        // non-alphabets
      [
       "\u3040"-"\u318f",                        // Hiragana .. Hangul Compatibility Jamo
       "\u3300"-"\u337f",                        // CJK Compatibility
       "\u3400"-"\u3d2d",                        // part of CJK Unified Ideographs Extension A
       "\u4e00"-"\u9fff",                        // CJK Unified Ideographs
       "\uf900"-"\ufaff"                         // CJK Compatibility Ideographs
      ]
    >

| < #DIGIT:  // unicode digits
      [
       "\u0030"-"\u0039",                        // ASCII 0-9
       "\u0660"-"\u0669",                        // Arabic-Indic
       "\u06f0"-"\u06f9",                        // Extended Arabic-Indic
       "\u0966"-"\u096f",                        // Devanagari
       "\u09e6"-"\u09ef",                        // Bengali
       "\u0a66"-"\u0a6f",                        // Gurmukhi
       "\u0ae6"-"\u0aef",                        // Gujarati
       "\u0b66"-"\u0b6f",                        // Oriya
       "\u0be7"-"\u0bef",                        // Tamil (this range starts at digit one)
       "\u0c66"-"\u0c6f",                        // Telugu
       "\u0ce6"-"\u0cef",                        // Kannada
       "\u0d66"-"\u0d6f",                        // Malayalam
       "\u0e50"-"\u0e59",                        // Thai
       "\u0ed0"-"\u0ed9",                        // Lao
       "\u1040"-"\u1049"                         // Myanmar
      ]
  >

}


/**
 * Parse a query.
 *
 * A query is a sequence of clauses separated by noise.  Each clause has an
 * optional "+" or "-" operator, an optional "field:" prefix, and then either
 * a quoted phrase or a compound term.  A clause preceded by "-" is added as a
 * prohibited phrase; every other clause that is kept is added as a required
 * phrase.
 */
Query parse() :
{
  Query query = new Query();
  ArrayList terms;
  Token token;
  String field;
  // true while the current clause may still be dropped as a stop word
  boolean stop;
  // true when the current clause was preceded by "-"
  boolean prohibited;

}
{
  nonOpOrTerm()                                   // skip noise
  (
    // reset the per-clause state
    { stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; }

                                                  // optional + or - operator;
                                                  // either one disables stop-word removal
    ( <PLUS> {stop=false;} | (<MINUS> { stop=false;prohibited=true; } ))?

                                                  // optional field spec.: taken only when
                                                  // "word:" is followed by a phrase or compound
    ( LOOKAHEAD(<WORD><COLON>(phrase(field)|compound(field)))
      token=<WORD> <COLON> { field = token.image; } )?

    ( terms=phrase(field) {stop=false;} |         // quoted terms (never dropped) or
      terms=compound(field))                      // single or compound term

    nonOpOrTerm()                                 // skip noise

    {
      String[] array = (String[])terms.toArray(new String[terms.size()]);

      // NOTE(review): "field == Clause.DEFAULT_FIELD" compares references,
      // not contents.  Presumably deliberate, so that only the value
      // assigned at the top of the clause qualifies -- confirm before
      // changing it to equals().
      if (stop
          && field == Clause.DEFAULT_FIELD
          && terms.size()==1
          && isStopWord(array[0])) {
        // ignore stop words only when single, unadorned terms in default field
      } else {
        if (prohibited)
          query.addProhibitedPhrase(array, field);
        else
          query.addRequiredPhrase(array, field);
      }
    }
  )*

  { return query; }

}

/** Parse an explcitly quoted phrase query.  Note that this may return a single
 * term, a trivial phrase.*/
ArrayList phrase(String field) :
{
  int start;
  int end;
  ArrayList result = new ArrayList();
  String term;
}
{
  <QUOTE>

  { start = token.endColumn; }

  (nonTerm())*                                    // skip noise
  ( term = term() { result.add(term); }           // parse a term
    (nonTerm())*)*                                // skip noise

  { end = token.endColumn; }

  (<QUOTE>|<EOF>)

  {
    if (QueryFilters.isRawField(field)) {
      result.clear();
      result.add(queryString.substring(start, end));
    }
    return result;
  }

}

/** Parse a compound term that is interpreted as an implicit phrase query.
 * Compounds are a sequence of terms separated by infix characters.  Note that
 * htis may return a single term, a trivial compound. */
ArrayList compound(String field) :
{
  int start;
  ArrayList result = new ArrayList();
  String term;
}
{
  { start = token.endColumn; }

  term = term() { result.add(term); }
  ( LOOKAHEAD( (infix())+ term() )
    (infix())+
    term = term() { result.add(term); })*

  {
    if (QueryFilters.isRawField(field)) {
      result.clear();
      result.add(queryString.substring(start, token.endColumn));
    }
    return result;
  }

}

/** Parse a single term. */
String term() :
{
  Token token;
}
{
  ( token=<WORD> | token=<ACRONYM> | token=<SIGRAM>)

  { return token.image; }
}


/** Parse anything but a term or a quote: a single WHITE token or a single
 * infix character. */
void nonTerm() :
{}
{
  <WHITE> | infix()
}

/** Parse a single non-term token (see nonTerm()) or the end of input. */
void nonTermOrEOF() :
{}
{
  nonTerm() | <EOF>
}

/** Parse anything but a term or an operator (plus or minus or quote).
 * Consumes any number of WHITE tokens and non-operator infix characters.
 * A plus or minus is consumed here, together with the token after it, only
 * when that next token is a non-term or the end of input; the two-token
 * lookahead leaves a plus or minus that precedes a term or a quote
 * unconsumed. */
void nonOpOrTerm() :
{}
{
  (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
}

/** Characters which can be used to form compound terms: plus, minus, or any
 * of the characters accepted by nonOpInfix(). */
void infix() :
{}
{
  <PLUS> | <MINUS> | nonOpInfix()
}

/** Parse infix characters except plus and minus: colon, slash, dot, at sign
 * or apostrophe. */
void nonOpInfix() :
{}
{
  <COLON>|<SLASH>|<DOT>|<ATSIGN>|<APOSTROPHE>
}
_______________________________________________
Rails mailing list
[hidden email]
http://lists.rubyonrails.org/mailman/listinfo/rails
Reply | Threaded
Open this post in threaded view
|

Re: Re: ANN: acts_as_ferret and a fix for unicode

Erik Hatcher

On Dec 16, 2005, at 12:14 AM, hui wrote:
> It's so cool!
> I am just looking for the CJK solutions,
> Here is "JavaCC code for the Nutch lexical analyzer."
> It is included in the Nutch source code, so could anyone port it to ferret?

There are several other Analyzers in Lucene that can deal with CJK  
(and actually Korean doesn't really fit with Chinese and Japanese).  
Lucene's StandardAnalyzer recognizes the CJK range just as the Nutch  
one does, and there are also these additional ones (in the cjk and cn  
directories):

        <http://svn.apache.org/viewcvs.cgi/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/>

Erik

_______________________________________________
Rails mailing list
[hidden email]
http://lists.rubyonrails.org/mailman/listinfo/rails