public abstract class CompoundWordTokenFilterBase extends TokenFilter
Modifier and Type | Field and Description |
---|---|
static int |
DEFAULT_MAX_SUBWORD_SIZE
The default maximum length of subwords that get propagated to the output of this filter.
|
static int |
DEFAULT_MIN_SUBWORD_SIZE
The default minimum length of subwords that get propagated to the output of this filter.
|
static int |
DEFAULT_MIN_WORD_SIZE
The default minimum length a word must have in order to be decomposed.
|
protected CharArraySet |
dictionary |
protected int |
maxSubwordSize |
protected int |
minSubwordSize |
protected int |
minWordSize |
protected boolean |
onlyLongestMatch |
protected LinkedList |
tokens |
input
Modifier | Constructor and Description |
---|---|
protected |
CompoundWordTokenFilterBase(TokenStream input,
Set dictionary) |
protected |
CompoundWordTokenFilterBase(TokenStream input,
Set dictionary,
boolean onlyLongestMatch) |
protected |
CompoundWordTokenFilterBase(TokenStream input,
Set dictionary,
int minWordSize,
int minSubwordSize,
int maxSubwordSize,
boolean onlyLongestMatch) |
protected |
CompoundWordTokenFilterBase(TokenStream input,
String[] dictionary) |
protected |
CompoundWordTokenFilterBase(TokenStream input,
String[] dictionary,
boolean onlyLongestMatch) |
protected |
CompoundWordTokenFilterBase(TokenStream input,
String[] dictionary,
int minWordSize,
int minSubwordSize,
int maxSubwordSize,
boolean onlyLongestMatch) |
Modifier and Type | Method and Description |
---|---|
protected static void |
addAllLowerCase(Set target,
Collection col) |
protected Token |
createToken(int offset,
int length,
Token prototype) |
protected void |
decompose(Token token) |
protected abstract void |
decomposeInternal(Token token) |
static Set |
makeDictionary(String[] dictionary)
Creates a set of words from an array.
The resulting Set does case-insensitive matching.
TODO: We should look for a faster dictionary lookup approach.
|
protected static char[] |
makeLowerCaseCopy(char[] buffer) |
Token |
next(Token reusableToken) |
close, reset
next
public static final int DEFAULT_MIN_WORD_SIZE
public static final int DEFAULT_MIN_SUBWORD_SIZE
public static final int DEFAULT_MAX_SUBWORD_SIZE
protected final CharArraySet dictionary
protected final LinkedList tokens
protected final int minWordSize
protected final int minSubwordSize
protected final int maxSubwordSize
protected final boolean onlyLongestMatch
protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch)
protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch)
protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, boolean onlyLongestMatch)
protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary)
protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary)
protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch)
public static final Set makeDictionary(String[] dictionary)
Parameters: dictionary
public Token next(Token reusableToken) throws IOException
Overrides: next in class TokenStream
Throws: IOException
protected static final void addAllLowerCase(Set target, Collection col)
protected static char[] makeLowerCaseCopy(char[] buffer)
protected void decompose(Token token)
protected abstract void decomposeInternal(Token token)
Copyright © 2000-2013 Apache Software Foundation. All Rights Reserved.