public abstract class CompoundWordTokenFilterBase extends TokenFilter
Modifier and Type | Field and Description |
---|---|
static int |
DEFAULT_MAX_SUBWORD_SIZE
The default maximum length of subwords that are propagated to the output of this filter.
|
static int |
DEFAULT_MIN_SUBWORD_SIZE
The default minimum length of subwords that are propagated to the output of this filter.
|
static int |
DEFAULT_MIN_WORD_SIZE
The default minimum length a word must have in order to be decomposed.
|
protected CharArraySet |
dictionary |
protected int |
maxSubwordSize |
protected int |
minSubwordSize |
protected int |
minWordSize |
protected boolean |
onlyLongestMatch |
protected java.util.LinkedList |
tokens |
input
Modifier | Constructor and Description |
---|---|
protected |
CompoundWordTokenFilterBase(TokenStream input,
java.util.Set dictionary) |
protected |
CompoundWordTokenFilterBase(TokenStream input,
java.util.Set dictionary,
boolean onlyLongestMatch) |
protected |
CompoundWordTokenFilterBase(TokenStream input,
java.util.Set dictionary,
int minWordSize,
int minSubwordSize,
int maxSubwordSize,
boolean onlyLongestMatch) |
protected |
CompoundWordTokenFilterBase(TokenStream input,
java.lang.String[] dictionary) |
protected |
CompoundWordTokenFilterBase(TokenStream input,
java.lang.String[] dictionary,
boolean onlyLongestMatch) |
protected |
CompoundWordTokenFilterBase(TokenStream input,
java.lang.String[] dictionary,
int minWordSize,
int minSubwordSize,
int maxSubwordSize,
boolean onlyLongestMatch) |
Modifier and Type | Method and Description |
---|---|
protected static void |
addAllLowerCase(java.util.Set target,
java.util.Collection col) |
protected Token |
createToken(int offset,
int length,
Token prototype) |
protected void |
decompose(Token token) |
protected abstract void |
decomposeInternal(Token token) |
static java.util.Set |
makeDictionary(java.lang.String[] dictionary)
Creates a set of words from an array.
The resulting Set performs case-insensitive matching.
TODO: We should look for a faster dictionary lookup approach.
|
protected static char[] |
makeLowerCaseCopy(char[] buffer) |
Token |
next(Token reusableToken) |
close, reset
next
public static final int DEFAULT_MIN_WORD_SIZE
public static final int DEFAULT_MIN_SUBWORD_SIZE
public static final int DEFAULT_MAX_SUBWORD_SIZE
protected final CharArraySet dictionary
protected final java.util.LinkedList tokens
protected final int minWordSize
protected final int minSubwordSize
protected final int maxSubwordSize
protected final boolean onlyLongestMatch
protected CompoundWordTokenFilterBase(TokenStream input, java.lang.String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch)
protected CompoundWordTokenFilterBase(TokenStream input, java.lang.String[] dictionary, boolean onlyLongestMatch)
protected CompoundWordTokenFilterBase(TokenStream input, java.util.Set dictionary, boolean onlyLongestMatch)
protected CompoundWordTokenFilterBase(TokenStream input, java.lang.String[] dictionary)
protected CompoundWordTokenFilterBase(TokenStream input, java.util.Set dictionary)
protected CompoundWordTokenFilterBase(TokenStream input, java.util.Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch)
public static final java.util.Set makeDictionary(java.lang.String[] dictionary)
Parameters: dictionary -
public Token next(Token reusableToken) throws java.io.IOException
Overrides: next in class TokenStream
Throws: java.io.IOException
protected static final void addAllLowerCase(java.util.Set target, java.util.Collection col)
protected static char[] makeLowerCaseCopy(char[] buffer)
protected void decompose(Token token)
protected abstract void decomposeInternal(Token token)
Copyright © 2000-2014 Apache Software Foundation. All Rights Reserved.