- java.lang.Object
-
- org.apache.lucene.util.automaton.RegExp
-
public class RegExp extends java.lang.Object
Regular Expression extension to Automaton.
Regular expressions are built from the following abstract syntax:
Description of regular expression grammar:

regexp       ::= unionexp
               |
unionexp     ::= interexp | unionexp      (union)
               | interexp
interexp     ::= concatexp & interexp     (intersection) [OPTIONAL]
               | concatexp
concatexp    ::= repeatexp concatexp      (concatenation)
               | repeatexp
repeatexp    ::= repeatexp ?              (zero or one occurrence)
               | repeatexp *              (zero or more occurrences)
               | repeatexp +              (one or more occurrences)
               | repeatexp {n}            (n occurrences)
               | repeatexp {n,}           (n or more occurrences)
               | repeatexp {n,m}          (n to m occurrences, including both)
               | complexp
complexp     ::= ~ complexp               (complement) [OPTIONAL]
               | charclassexp
charclassexp ::= [ charclasses ]          (character class)
               | [^ charclasses ]         (negated character class)
               | simpleexp
charclasses  ::= charclass charclasses
               | charclass
charclass    ::= charexp - charexp        (character range, including end-points)
               | charexp
simpleexp    ::= charexp
               | .                        (any single character)
               | #                        (the empty language) [OPTIONAL]
               | @                        (any string) [OPTIONAL]
               | "<Unicode string without double-quotes>"   (a string)
               | ( )                      (the empty string)
               | ( unionexp )             (precedence override)
               | <<identifier>>           (named automaton) [OPTIONAL]
               | <n-m>                    (numerical interval) [OPTIONAL]
charexp      ::= <Unicode character>      (a single non-reserved character)
               | \d                       (a digit [0-9])
               | \D                       (a non-digit [^0-9])
               | \s                       (whitespace [ \t\n\r])
               | \S                       (non whitespace [^\s])
               | \w                       (a word character [a-zA-Z_0-9])
               | \W                       (a non word character [^\w])
               | \<Unicode character>     (a single character)

The productions marked [OPTIONAL] are only allowed if specified by the syntax flags passed to the
RegExp constructor. The reserved characters used in the (enabled) syntax must be escaped with backslash (\) or double-quotes ("..."). (In contrast to other regexp syntaxes, this is required also in character classes.) Be aware that dash (-) has a special meaning in charclass expressions. An identifier is a string not containing right angle bracket (>) or dash (-). Numerical intervals are specified by non-negative decimal integers and include both end points, and if n and m have the same number of digits, then the conforming strings must have that length (i.e. prefixed by 0's).
-
-
Nested Class Summary
Nested Classes
Modifier and Type | Class | Description
static class
RegExp.Kind
The type of expression represented by a RegExp node.
private static interface
RegExp.MakeRegexGroup
Custom functional interface for supplying methods with the signature RegExp(int int1, RegExp exp1, RegExp exp2).
-
Field Summary
Fields
Modifier and Type | Field | Description
static int
ALL
Syntax flag, enables all optional regexp syntax.
static int
ANYSTRING
Syntax flag, enables anystring (@).
static int
ASCII_CASE_INSENSITIVE
Allows case insensitive matching of ASCII characters.
static int
AUTOMATON
Syntax flag, enables named automata (<identifier>).
int
c
Character expression
static int
COMPLEMENT
Syntax flag, enables complement (~).
int
digits
Limits for repeatable type expressions
static int
EMPTY
Syntax flag, enables empty language (#).
RegExp
exp1
Child expressions held by a container type expression
RegExp
exp2
Child expressions held by a container type expression
(package private) int
flags
int
from
Extents for range type expressions
static int
INTERSECTION
Syntax flag, enables intersection (&).
static int
INTERVAL
Syntax flag, enables numerical intervals (<n-m>).
RegExp.Kind
kind
The type of expression
int
max
Limits for repeatable type expressions
int
min
Limits for repeatable type expressions
static int
NONE
Syntax flag, enables no optional regexp syntax.
private java.lang.String
originalString
(package private) int
pos
java.lang.String
s
String expression
int
to
Extents for range type expressions
-
Constructor Summary
Constructors
Constructor | Description
RegExp(int flags, RegExp.Kind kind, RegExp exp1, RegExp exp2, java.lang.String s, int c, int min, int max, int digits, int from, int to)
RegExp(java.lang.String s)
Constructs new RegExp from a string.
RegExp(java.lang.String s, int syntax_flags)
Constructs new RegExp from a string.
RegExp(java.lang.String s, int syntax_flags, int match_flags)
Constructs new RegExp from a string.
-
Method Summary
All Methods | Static Methods | Instance Methods | Concrete Methods
Modifier and Type | Method | Description
private boolean
check(int flag)
(package private) RegExp
expandPredefined()
private void
findLeaves(RegExp exp, RegExp.Kind kind, java.util.List<Automaton> list, java.util.Map<java.lang.String,Automaton> automata, AutomatonProvider automaton_provider, int determinizeWorkLimit)
java.util.Set<java.lang.String>
getIdentifiers()
Returns set of automaton identifiers that occur in this regular expression.
(package private) void
getIdentifiers(java.util.Set<java.lang.String> set)
java.lang.String
getOriginalString()
The string that was used to construct the regex.
(package private) RegExp
iterativeParseExp(java.util.function.Supplier<RegExp> gather, java.util.function.BooleanSupplier stop, RegExp.MakeRegexGroup associativeReduce)
(package private) static RegExp
makeAnyChar(int flags)
(package private) static RegExp
makeAnyString(int flags)
(package private) static RegExp
makeAutomaton(int flags, java.lang.String s)
(package private) static RegExp
makeChar(int flags, int c)
(package private) static RegExp
makeCharRange(int flags, int from, int to)
(package private) static RegExp
makeComplement(int flags, RegExp exp)
(package private) static RegExp
makeConcatenation(int flags, RegExp exp1, RegExp exp2)
(package private) static RegExp
makeEmpty(int flags)
(package private) static RegExp
makeIntersection(int flags, RegExp exp1, RegExp exp2)
(package private) static RegExp
makeInterval(int flags, int min, int max, int digits)
(package private) static RegExp
makeOptional(int flags, RegExp exp)
(package private) static RegExp
makeRepeat(int flags, RegExp exp)
(package private) static RegExp
makeRepeat(int flags, RegExp exp, int min)
(package private) static RegExp
makeRepeat(int flags, RegExp exp, int min, int max)
(package private) static RegExp
makeString(int flags, java.lang.String s)
private static RegExp
makeString(int flags, RegExp exp1, RegExp exp2)
(package private) static RegExp
makeUnion(int flags, RegExp exp1, RegExp exp2)
private boolean
match(int c)
(package private) RegExp
matchPredefinedCharacterClass()
private boolean
more()
(package private) static RegExp
newContainerNode(int flags, RegExp.Kind kind, RegExp exp1, RegExp exp2)
(package private) static RegExp
newLeafNode(int flags, RegExp.Kind kind, java.lang.String s, int c, int min, int max, int digits, int from, int to)
(package private) static RegExp
newRepeatingNode(int flags, RegExp.Kind kind, RegExp exp, int min, int max)
private int
next()
(package private) RegExp
parseCharClass()
(package private) RegExp
parseCharClasses()
(package private) RegExp
parseCharClassExp()
(package private) int
parseCharExp()
(package private) RegExp
parseComplExp()
(package private) RegExp
parseConcatExp()
(package private) RegExp
parseInterExp()
(package private) RegExp
parseRepeatExp()
(package private) RegExp
parseSimpleExp()
(package private) RegExp
parseUnionExp()
private boolean
peek(java.lang.String s)
Automaton
toAutomaton()
Constructs new Automaton from this RegExp.
Automaton
toAutomaton(int determinizeWorkLimit)
Constructs new Automaton from this RegExp.
Automaton
toAutomaton(java.util.Map<java.lang.String,Automaton> automata, int determinizeWorkLimit)
Constructs new Automaton from this RegExp.
private Automaton
toAutomaton(java.util.Map<java.lang.String,Automaton> automata, AutomatonProvider automaton_provider, int determinizeWorkLimit)
Automaton
toAutomaton(AutomatonProvider automaton_provider, int determinizeWorkLimit)
Constructs new Automaton from this RegExp.
private Automaton
toAutomatonInternal(java.util.Map<java.lang.String,Automaton> automata, AutomatonProvider automaton_provider, int determinizeWorkLimit)
private Automaton
toCaseInsensitiveChar(int codepoint, int determinizeWorkLimit)
private Automaton
toCaseInsensitiveString(int determinizeWorkLimit)
java.lang.String
toString()
Constructs string from parsed regular expression.
(package private) void
toStringBuilder(java.lang.StringBuilder b)
java.lang.String
toStringTree()
Like toString, but more verbose (shows the hierarchy more clearly).
(package private) void
toStringTree(java.lang.StringBuilder b, java.lang.String indent)
-
-
-
Field Detail
-
INTERSECTION
public static final int INTERSECTION
Syntax flag, enables intersection (&).
- See Also:
- Constant Field Values
-
COMPLEMENT
public static final int COMPLEMENT
Syntax flag, enables complement (~).
- See Also:
- Constant Field Values
-
EMPTY
public static final int EMPTY
Syntax flag, enables empty language (#).
- See Also:
- Constant Field Values
-
ANYSTRING
public static final int ANYSTRING
Syntax flag, enables anystring (@).
- See Also:
- Constant Field Values
-
AUTOMATON
public static final int AUTOMATON
Syntax flag, enables named automata (<identifier>).
- See Also:
- Constant Field Values
-
INTERVAL
public static final int INTERVAL
Syntax flag, enables numerical intervals (<n-m>).
- See Also:
- Constant Field Values
-
ALL
public static final int ALL
Syntax flag, enables all optional regexp syntax.
- See Also:
- Constant Field Values
-
NONE
public static final int NONE
Syntax flag, enables no optional regexp syntax.
- See Also:
- Constant Field Values
-
ASCII_CASE_INSENSITIVE
public static final int ASCII_CASE_INSENSITIVE
Allows case insensitive matching of ASCII characters.
- See Also:
- Constant Field Values
-
kind
public final RegExp.Kind kind
The type of expression
-
exp1
public final RegExp exp1
Child expressions held by a container type expression
-
exp2
public final RegExp exp2
Child expressions held by a container type expression
-
s
public final java.lang.String s
String expression
-
c
public final int c
Character expression
-
min
public final int min
Limits for repeatable type expressions
-
max
public final int max
Limits for repeatable type expressions
-
digits
public final int digits
Limits for repeatable type expressions
-
from
public final int from
Extents for range type expressions
-
to
public final int to
Extents for range type expressions
-
originalString
private final java.lang.String originalString
-
flags
final int flags
-
pos
int pos
-
-
Constructor Detail
-
RegExp
public RegExp(java.lang.String s) throws java.lang.IllegalArgumentException
Constructs new RegExp from a string. Same as RegExp(s, ALL).
- Parameters:
s - regexp string
- Throws:
java.lang.IllegalArgumentException - if an error occurred while parsing the regular expression
-
RegExp
public RegExp(java.lang.String s, int syntax_flags) throws java.lang.IllegalArgumentException
Constructs new RegExp from a string.
- Parameters:
s - regexp string
syntax_flags - boolean 'or' of optional syntax constructs to be enabled
- Throws:
java.lang.IllegalArgumentException - if an error occurred while parsing the regular expression
-
RegExp
public RegExp(java.lang.String s, int syntax_flags, int match_flags) throws java.lang.IllegalArgumentException
Constructs new RegExp from a string.
- Parameters:
s - regexp string
syntax_flags - boolean 'or' of optional syntax constructs to be enabled
match_flags - boolean 'or' of match behavior options such as case insensitivity
- Throws:
java.lang.IllegalArgumentException - if an error occurred while parsing the regular expression
-
RegExp
RegExp(int flags, RegExp.Kind kind, RegExp exp1, RegExp exp2, java.lang.String s, int c, int min, int max, int digits, int from, int to)
-
-
Method Detail
-
newContainerNode
static RegExp newContainerNode(int flags, RegExp.Kind kind, RegExp exp1, RegExp exp2)
-
newRepeatingNode
static RegExp newRepeatingNode(int flags, RegExp.Kind kind, RegExp exp, int min, int max)
-
newLeafNode
static RegExp newLeafNode(int flags, RegExp.Kind kind, java.lang.String s, int c, int min, int max, int digits, int from, int to)
-
toAutomaton
public Automaton toAutomaton()
Constructs new Automaton from this RegExp. Same as toAutomaton(null) (empty automaton map).
-
toAutomaton
public Automaton toAutomaton(int determinizeWorkLimit) throws java.lang.IllegalArgumentException, TooComplexToDeterminizeException
Constructs new Automaton from this RegExp. The constructed automaton is minimal and deterministic and has no transitions to dead states.
- Parameters:
determinizeWorkLimit - maximum effort to spend while determinizing the automata. If determinizing the automata would require more than this effort, TooComplexToDeterminizeException is thrown. Higher numbers require more space but can process more complex regexes. Use Operations.DEFAULT_DETERMINIZE_WORK_LIMIT as a decent default if you don't otherwise know what to specify.
- Throws:
java.lang.IllegalArgumentException - if this regular expression uses a named identifier that is not available from the automaton provider
TooComplexToDeterminizeException - if determinizing this regexp requires more effort than determinizeWorkLimit states
-
toAutomaton
public Automaton toAutomaton(AutomatonProvider automaton_provider, int determinizeWorkLimit) throws java.lang.IllegalArgumentException, TooComplexToDeterminizeException
Constructs new Automaton from this RegExp. The constructed automaton is minimal and deterministic and has no transitions to dead states.
- Parameters:
automaton_provider - provider of automata for named identifiers
determinizeWorkLimit - maximum effort to spend while determinizing the automata. If determinizing the automata would require more than this effort, TooComplexToDeterminizeException is thrown. Higher numbers require more space but can process more complex regexes. Use Operations.DEFAULT_DETERMINIZE_WORK_LIMIT as a decent default if you don't otherwise know what to specify.
- Throws:
java.lang.IllegalArgumentException - if this regular expression uses a named identifier that is not available from the automaton provider
TooComplexToDeterminizeException - if determinizing this regexp requires more effort than determinizeWorkLimit states
-
toAutomaton
public Automaton toAutomaton(java.util.Map<java.lang.String,Automaton> automata, int determinizeWorkLimit) throws java.lang.IllegalArgumentException, TooComplexToDeterminizeException
Constructs new Automaton from this RegExp. The constructed automaton is minimal and deterministic and has no transitions to dead states.
- Parameters:
automata - a map from automaton identifiers to automata (of type Automaton).
determinizeWorkLimit - maximum effort to spend while determinizing the automata. If determinizing the automata would require more than this effort, TooComplexToDeterminizeException is thrown. Higher numbers require more space but can process more complex regexes.
- Throws:
java.lang.IllegalArgumentException - if this regular expression uses a named identifier that does not occur in the automaton map
TooComplexToDeterminizeException - if determinizing this regexp requires more effort than determinizeWorkLimit states
-
toAutomaton
private Automaton toAutomaton(java.util.Map<java.lang.String,Automaton> automata, AutomatonProvider automaton_provider, int determinizeWorkLimit) throws java.lang.IllegalArgumentException, TooComplexToDeterminizeException
- Throws:
java.lang.IllegalArgumentException
TooComplexToDeterminizeException
-
toAutomatonInternal
private Automaton toAutomatonInternal(java.util.Map<java.lang.String,Automaton> automata, AutomatonProvider automaton_provider, int determinizeWorkLimit) throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
toCaseInsensitiveChar
private Automaton toCaseInsensitiveChar(int codepoint, int determinizeWorkLimit)
-
toCaseInsensitiveString
private Automaton toCaseInsensitiveString(int determinizeWorkLimit)
-
findLeaves
private void findLeaves(RegExp exp, RegExp.Kind kind, java.util.List<Automaton> list, java.util.Map<java.lang.String,Automaton> automata, AutomatonProvider automaton_provider, int determinizeWorkLimit)
-
getOriginalString
public java.lang.String getOriginalString()
The string that was used to construct the regex. Compare to toString.
-
toString
public java.lang.String toString()
Constructs string from parsed regular expression.
- Overrides:
toString in class java.lang.Object
-
toStringBuilder
void toStringBuilder(java.lang.StringBuilder b)
-
toStringTree
public java.lang.String toStringTree()
Like toString, but more verbose (shows the hierarchy more clearly).
-
toStringTree
void toStringTree(java.lang.StringBuilder b, java.lang.String indent)
-
getIdentifiers
public java.util.Set<java.lang.String> getIdentifiers()
Returns set of automaton identifiers that occur in this regular expression.
-
getIdentifiers
void getIdentifiers(java.util.Set<java.lang.String> set)
-
makeChar
static RegExp makeChar(int flags, int c)
-
makeCharRange
static RegExp makeCharRange(int flags, int from, int to)
-
makeAnyChar
static RegExp makeAnyChar(int flags)
-
makeEmpty
static RegExp makeEmpty(int flags)
-
makeString
static RegExp makeString(int flags, java.lang.String s)
-
makeAnyString
static RegExp makeAnyString(int flags)
-
makeAutomaton
static RegExp makeAutomaton(int flags, java.lang.String s)
-
makeInterval
static RegExp makeInterval(int flags, int min, int max, int digits)
-
peek
private boolean peek(java.lang.String s)
-
match
private boolean match(int c)
-
more
private boolean more()
-
next
private int next() throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
check
private boolean check(int flag)
-
parseUnionExp
final RegExp parseUnionExp() throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
parseInterExp
final RegExp parseInterExp() throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
parseConcatExp
final RegExp parseConcatExp() throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
iterativeParseExp
final RegExp iterativeParseExp(java.util.function.Supplier<RegExp> gather, java.util.function.BooleanSupplier stop, RegExp.MakeRegexGroup associativeReduce) throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
parseRepeatExp
final RegExp parseRepeatExp() throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
parseComplExp
final RegExp parseComplExp() throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
parseCharClassExp
final RegExp parseCharClassExp() throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
parseCharClasses
final RegExp parseCharClasses() throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
parseCharClass
final RegExp parseCharClass() throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
expandPredefined
RegExp expandPredefined()
-
matchPredefinedCharacterClass
final RegExp matchPredefinedCharacterClass()
-
parseSimpleExp
final RegExp parseSimpleExp() throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
parseCharExp
final int parseCharExp() throws java.lang.IllegalArgumentException
- Throws:
java.lang.IllegalArgumentException
-
-