ICU 51.2  51.2
regex.h
Go to the documentation of this file.
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: regex.h
7 * encoding: US-ASCII
8 * indentation:4
9 *
10 * created on: 2002oct22
11 * created by: Andy Heninger
12 *
13 * ICU Regular Expressions, API for C++
14 */
15 
16 #ifndef REGEX_H
17 #define REGEX_H
18 
19 //#define REGEX_DEBUG
20 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
48 
49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h"
51 #include "unicode/utext.h"
52 #include "unicode/parseerr.h"
53 
54 #include "unicode/uregex.h"
55 
56 // Forward Declarations
57 
59 
60 struct Regex8BitSet;
61 class RegexCImpl;
62 class RegexMatcher;
63 class RegexPattern;
64 struct REStackFrame;
65 class RuleBasedBreakIterator;
66 class UnicodeSet;
67 class UVector;
68 class UVector32;
69 class UVector64;
70 
71 #ifndef U_HIDE_INTERNAL_API
72 
// Debug-only pattern dump hook.
// With REGEX_DEBUG defined, RegexPatternDump is declared as a real function
// taking a const RegexPattern *. Otherwise any earlier definition is removed and
// RegexPatternDump(pat) becomes a macro that expands to nothing, so call sites
// compile away.
76 #ifdef REGEX_DEBUG
77 U_INTERNAL void U_EXPORT2
78  RegexPatternDump(const RegexPattern *pat);
79 #else
80  #undef RegexPatternDump
81  #define RegexPatternDump(pat)
82 #endif
83 #endif /* U_HIDE_INTERNAL_API */
84 
85 
86 
99 public:
100 
// Default constructor.
108  RegexPattern();
109 
// Copy constructor; builds this object from `source`.
116  RegexPattern(const RegexPattern &source);
117 
// Virtual destructor.
123  virtual ~RegexPattern();
124 
// Equality comparison against another RegexPattern.
133  UBool operator==(const RegexPattern& that) const;
134 
// Inequality; defined inline as the logical negation of operator==(that).
143  inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}
144 
// Copy assignment from `source`; returns a RegexPattern reference.
150  RegexPattern &operator =(const RegexPattern &source);
151 
// Virtual copy, returned by pointer.
// NOTE(review): presumably a heap-allocated duplicate owned by the caller - confirm.
159  virtual RegexPattern *clone() const;
160 
161 
// Static factory overloads. The pattern source is passed either as a
// UnicodeString or as a UText; `flags` and the UParseError `pe` are optional
// depending on the overload; every overload takes a UErrorCode `status` and
// returns a RegexPattern pointer.
// NOTE(review): presumably `pe` receives the location of a syntax error, `status`
// reports failure, and the caller owns the returned object - confirm against the
// ICU API documentation.

// UnicodeString source, parse-error output, no flags.
186  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
187  UParseError &pe,
188  UErrorCode &status);
189 
// UText source, parse-error output, no flags.
216  static RegexPattern * U_EXPORT2 compile( UText *regex,
217  UParseError &pe,
218  UErrorCode &status);
219 
// UnicodeString source, flags, parse-error output.
244  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
245  uint32_t flags,
246  UParseError &pe,
247  UErrorCode &status);
248 
// UText source, flags, parse-error output.
275  static RegexPattern * U_EXPORT2 compile( UText *regex,
276  uint32_t flags,
277  UParseError &pe,
278  UErrorCode &status);
279 
// UnicodeString source, flags, no parse-error output.
302  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
303  uint32_t flags,
304  UErrorCode &status);
305 
// UText source, flags, no parse-error output.
330  static RegexPattern * U_EXPORT2 compile( UText *regex,
331  uint32_t flags,
332  UErrorCode &status);
333 
// Returns a uint32_t flag word.
// NOTE(review): presumably the flags this pattern was compiled with - confirm.
339  virtual uint32_t flags() const;
340 
// Returns a RegexMatcher pointer for the given UnicodeString `input`.
// NOTE(review): presumably newly allocated and owned by the caller - confirm.
358  virtual RegexMatcher *matcher(const UnicodeString &input,
359  UErrorCode &status) const;
360 
361 private:
// Private overload taking a raw UChar pointer.
// NOTE(review): appears to exist so that passing a UChar* from outside the class
// fails to compile instead of converting to a temporary UnicodeString and
// binding to the public overload - confirm.
375  RegexMatcher *matcher(const UChar *input,
376  UErrorCode &status) const;
377 public:
378 
379 
// Overload that takes no input text.
// NOTE(review): presumably the input is supplied later through one of the
// matcher's reset() overloads - confirm.
391  virtual RegexMatcher *matcher(UErrorCode &status) const;
392 
393 
// Static convenience overloads taking a regex and an input (both UnicodeString,
// or both UText), a UParseError and a UErrorCode; they return a UBool.
// NOTE(review): presumably they compile `regex` and report whether it matches
// the whole of `input` - confirm.
408  static UBool U_EXPORT2 matches(const UnicodeString &regex,
409  const UnicodeString &input,
410  UParseError &pe,
411  UErrorCode &status);
412 
427  static UBool U_EXPORT2 matches(UText *regex,
428  UText *input,
429  UParseError &pe,
430  UErrorCode &status);
431 
// Returns a UnicodeString by value.
// NOTE(review): presumably the source text of the pattern - confirm.
440  virtual UnicodeString pattern() const;
441 
442 
// UText counterpart of pattern(); returns a UText pointer.
// NOTE(review): ownership of the returned UText is not visible here - confirm.
453  virtual UText *patternText(UErrorCode &status) const;
454 
455 
// split() overloads: take an input, a caller-supplied array `dest` with
// `destCapacity` elements, and a UErrorCode; they return an int32_t.
// NOTE(review): presumably the input is split around matches of this pattern and
// the return value is the number of destination elements filled - confirm.

// UnicodeString input and destination array.
494  virtual int32_t split(const UnicodeString &input,
495  UnicodeString dest[],
496  int32_t destCapacity,
497  UErrorCode &status) const;
498 
499 
// UText input and destination array.
538  virtual int32_t split(UText *input,
539  UText *dest[],
540  int32_t destCapacity,
541  UErrorCode &status) const;
542 
543 
// Class ID accessors: a virtual per-object form and a static per-class form.
// NOTE(review): presumably ICU's UObject run-time type identification
// boilerplate - confirm.
549  virtual UClassID getDynamicClassID() const;
550 
556  static UClassID U_EXPORT2 getStaticClassID();
557 
558 private:
559  //
560  // Implementation Data
561  //
562  UText *fPattern; // The original pattern string.
563  UnicodeString *fPatternString; // The original pattern UnicodeString, if relevant.
564  uint32_t fFlags; // The flags used when compiling the pattern.
565  //
566  UVector64 *fCompiledPat; // The compiled pattern p-code.
567  UnicodeString fLiteralText; // Any literal string data from the pattern,
568  // after un-escaping, for use during the match.
569 
570  UVector *fSets; // Any UnicodeSets referenced from the pattern.
571  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
572 
573 
574  UErrorCode fDeferredStatus; // status if some prior error has left this
575  // RegexPattern in an unusable state.
576 
577  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
578  // >= this value. For some patterns, this calculated
579  // value may be less than the true shortest
580  // possible match.
581 
582  int32_t fFrameSize; // Size of a state stack frame in the
583  // execution engine.
584 
585  int32_t fDataSize; // The size of the data needed by the pattern that
586  // does not go on the state stack, but has just
587  // a single copy per matcher.
588 
589  UVector32 *fGroupMap; // Map from capture group number to position of
590  // the group's variables in the matcher stack frame.
591 
592  int32_t fMaxCaptureDigits; // NOTE(review): undocumented; presumably the digit count of the largest capture group number - confirm.
593 
594  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
595  // regex character classes, e.g. Word.
596 
597  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
598  // sets for predefined regex classes.
599 
600  int32_t fStartType; // Info on how a match must start.
601  int32_t fInitialStringIdx; // NOTE(review): presumably, with fInitialStringLen, locates a required leading literal - confirm.
602  int32_t fInitialStringLen;
603  UnicodeSet *fInitialChars; // NOTE(review): presumably the set of code points a match can start with - confirm.
604  UChar32 fInitialChar; // NOTE(review): presumably the single code point a match must start with - confirm.
605  Regex8BitSet *fInitialChars8; // NOTE(review): presumably the latin-1 fast-set form of fInitialChars - confirm.
606  UBool fNeedsAltInput; // NOTE(review): presumably set when matchers need a second input UText - confirm.
607 
// These classes are granted access to the private implementation data above.
608  friend class RegexCompile;
609  friend class RegexMatcher;
610  friend class RegexCImpl;
611 
612  //
613  // Implementation Methods
614  //
615  void init(); // Common initialization, for use by constructors.
616  void zap(); // Common cleanup
// The following members exist only in builds where REGEX_DEBUG is defined.
617 #ifdef REGEX_DEBUG
618  void dumpOp(int32_t index) const;
619  friend void U_EXPORT2 RegexPatternDump(const RegexPattern *);
620 #endif
621 
622 };
623 
624 
625 
636 public:
637 
// Public constructors. Each takes the regular expression itself (UnicodeString
// or UText), match-mode `flags` and a UErrorCode; two of them also take the
// input text.
// NOTE(review): presumably the expression is compiled internally and the
// resulting pattern is owned by the matcher (compare the fPatternOwned member
// comment) - confirm.

// Pattern as UnicodeString, no input yet.
652  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
653 
// Pattern as UText, no input yet.
669  RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
670 
// Pattern and input as UnicodeString.
692  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
693  uint32_t flags, UErrorCode &status);
694 
// Pattern and input as UText.
716  RegexMatcher(UText *regexp, UText *input,
717  uint32_t flags, UErrorCode &status);
718 
719 private:
// Private overload taking the input as a raw UChar pointer.
// NOTE(review): appears to exist so that passing a UChar* as input from outside
// the class fails to compile instead of converting to a temporary
// UnicodeString - confirm.
733  RegexMatcher(const UnicodeString &regexp, const UChar *input,
734  uint32_t flags, UErrorCode &status);
735 public:
736 
737 
// Virtual destructor.
743  virtual ~RegexMatcher();
744 
745 
// Match attempts; each returns a UBool. Overloads with an int64_t argument take
// an explicit position in the input.
// NOTE(review): the names mirror java.util.regex.Matcher (the fAppendPosition
// member comment cites its JavaDoc). Presumably matches() requires the whole
// region to match, lookingAt() requires a match beginning at the start, and
// find() searches for the next match - confirm.
752  virtual UBool matches(UErrorCode &status);
753 
754 
765  virtual UBool matches(int64_t startIndex, UErrorCode &status);
766 
767 
781  virtual UBool lookingAt(UErrorCode &status);
782 
783 
797  virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
798 
799 
// Unlike the other match functions, this overload has no UErrorCode parameter.
812  virtual UBool find();
813 
814 
824  virtual UBool find(int64_t start, UErrorCode &status);
825 
826 
// Capture group accessors. The UnicodeString forms return text by value; the
// UText forms take a destination UText and return a UText pointer.
// NOTE(review): presumably the overloads without `groupNum` refer to the whole
// match, and `group_len` receives the length of the group - confirm.
836  virtual UnicodeString group(UErrorCode &status) const;
837 
838 
851  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
852 
853 
// Returns an int32_t count.
// NOTE(review): presumably the number of capture groups in the pattern - confirm.
859  virtual int32_t groupCount() const;
860 
861 
876  virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
877 
893  virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
894 
910  virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
911 
912 
// Match position accessors. Each exists in a 32-bit form (start/end, returning
// int32_t) and a 64-bit form (start64/end64, returning int64_t), with and
// without a capture group number.
// NOTE(review): presumably the forms without `group` refer to the whole match,
// and end positions are one past the last matched position (compare the
// fMatchEnd member comment) - confirm.
920  virtual int32_t start(UErrorCode &status) const;
921 
929  virtual int64_t start64(UErrorCode &status) const;
930 
931 
945  virtual int32_t start(int32_t group, UErrorCode &status) const;
946 
960  virtual int64_t start64(int32_t group, UErrorCode &status) const;
961 
962 
976  virtual int32_t end(UErrorCode &status) const;
977 
991  virtual int64_t end64(UErrorCode &status) const;
992 
993 
1011  virtual int32_t end(int32_t group, UErrorCode &status) const;
1012 
1030  virtual int64_t end64(int32_t group, UErrorCode &status) const;
1031 
1032 
// reset() overloads; each returns a RegexMatcher reference, so calls can be
// chained.
// NOTE(review): presumably the argument-less form restarts matching on the
// current input, the `index` form restarts at that position, and the
// UnicodeString/UText forms switch to a new input - confirm.
1041  virtual RegexMatcher &reset();
1042 
1043 
1059  virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1060 
1061 
1079  virtual RegexMatcher &reset(const UnicodeString &input);
1080 
1081 
1095  virtual RegexMatcher &reset(UText *input);
1096 
1097 
// Takes a UText and a UErrorCode; returns a RegexMatcher reference.
// NOTE(review): presumably re-points the matcher at equivalent input text without
// discarding match state - confirm against the ICU API documentation.
1122  virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1123 
1124 private:
// Private overload taking a raw UChar pointer.
// NOTE(review): appears to exist so that reset(UChar*) from outside the class
// fails to compile instead of converting to a temporary UnicodeString - confirm.
1138  RegexMatcher &reset(const UChar *input);
1139 public:
1140 
// Input accessors: input() returns a const UnicodeString reference, inputText()
// a UText pointer, and getInput() takes a destination UText and returns a UText
// pointer.
1148  virtual const UnicodeString &input() const;
1149 
1158  virtual UText *inputText() const;
1159 
1170  virtual UText *getInput(UText *dest, UErrorCode &status) const;
1171 
1172 
// Region setters; each returns a RegexMatcher reference. The second overload
// takes an additional `startIndex`.
// NOTE(review): presumably these restrict matching to [start, limit) of the
// input (compare the fRegionStart/fRegionLimit member comments) - confirm.
1191  virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1192 
1204  virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1205 
// Region bound accessors, each in an int32_t form and an int64_t form.
1214  virtual int32_t regionStart() const;
1215 
1224  virtual int64_t regionStart64() const;
1225 
1226 
1235  virtual int32_t regionEnd() const;
1236 
1245  virtual int64_t regionEnd64() const;
1246 
// Getter/setter pairs for the two bounds modes. The setters take a UBool and
// return a RegexMatcher reference.
// NOTE(review): presumably backed by the fTransparentBounds and fAnchoringBounds
// members - confirm.
1255  virtual UBool hasTransparentBounds() const;
1256 
1275  virtual RegexMatcher &useTransparentBounds(UBool b);
1276 
1277 
1285  virtual UBool hasAnchoringBounds() const;
1286 
1287 
1300  virtual RegexMatcher &useAnchoringBounds(UBool b);
1301 
1302 
// UBool status queries.
// NOTE(review): presumably these report the fHitEnd and fRequireEnd members
// (last match touched, or required, the end of input) - confirm.
1315  virtual UBool hitEnd() const;
1316 
1326  virtual UBool requireEnd() const;
1327 
1328 
// Returns a const reference to a RegexPattern.
// NOTE(review): presumably the pattern this matcher was created for - confirm.
1334  virtual const RegexPattern &pattern() const;
1335 
1336 
// Replacement operations, each in a UnicodeString form and a UText form. The
// UnicodeString replaceAll/replaceFirst return the result by value; the UText
// forms take a destination UText and return a UText pointer.
// NOTE(review): presumably replaceAll() substitutes every match in the input and
// replaceFirst() only the first - confirm.
1353  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1354 
1355 
1376  virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1377 
1378 
1399  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1400 
1401 
1426  virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1427 
1428 
// Incremental replacement; both overloads take a destination and a replacement
// and return a RegexMatcher reference.
// NOTE(review): the fAppendPosition member comment points to the Java Matcher
// JavaDoc for the "append position" these functions maintain - confirm details
// there.
1456  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1457  const UnicodeString &replacement, UErrorCode &status);
1458 
1459 
1487  virtual RegexMatcher &appendReplacement(UText *dest,
1488  UText *replacement, UErrorCode &status);
1489 
1490 
// Take a destination and return it (UnicodeString reference or UText pointer
// type). The UnicodeString form has no UErrorCode parameter.
// NOTE(review): presumably appends the input remaining after the last
// appendReplacement() - confirm.
1501  virtual UnicodeString &appendTail(UnicodeString &dest);
1502 
1503 
1517  virtual UText *appendTail(UText *dest, UErrorCode &status);
1518 
1519 
// split() overloads: take an input, a caller-supplied array `dest` with
// `destCapacity` elements, and a UErrorCode; they return an int32_t. Neither
// overload is declared const.
// NOTE(review): presumably the input is split around matches of the matcher's
// pattern and the return value is the number of elements filled - confirm.

// UnicodeString input and destination array.
1543  virtual int32_t split(const UnicodeString &input,
1544  UnicodeString dest[],
1545  int32_t destCapacity,
1546  UErrorCode &status);
1547 
1548 
// UText input and destination array.
1572  virtual int32_t split(UText *input,
1573  UText *dest[],
1574  int32_t destCapacity,
1575  UErrorCode &status);
1576 
// Time limit setter/getter (int32_t).
// NOTE(review): presumably backed by fTimeLimit, whose member comment describes
// arbitrary time steps with zero meaning unlimited - confirm.
1598  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1599 
1606  virtual int32_t getTimeLimit() const;
1607 
// Stack limit setter/getter (int32_t).
// NOTE(review): presumably backed by fStackLimit, whose member comment describes
// a backtrack-stack size in bytes with zero meaning unlimited - confirm.
1629  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1630 
1638  virtual int32_t getStackLimit() const;
1639 
1640 
// Match callback setter/getter. The setter takes a callback function pointer
// and an opaque `context` pointer; the getter returns both through reference
// parameters.
1654  virtual void setMatchCallback(URegexMatchCallback *callback,
1655  const void *context,
1656  UErrorCode &status);
1657 
1658 
1669  virtual void getMatchCallback(URegexMatchCallback *&callback,
1670  const void *&context,
1671  UErrorCode &status);
1672 
1673 
// Find-progress callback setter/getter, with the same parameter shape as the
// match callback pair but using URegexFindProgressCallback.
1687  virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
1688  const void *context,
1689  UErrorCode &status);
1690 
1691 
1702  virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
1703  const void *&context,
1704  UErrorCode &status);
1705 
// Internal API, omitted when U_HIDE_INTERNAL_API is defined.
1706 #ifndef U_HIDE_INTERNAL_API
1707 
// Takes a UBool.
// NOTE(review): presumably sets fTraceDebug (debug tracing of the match
// engine) - confirm.
1712  void setTrace(UBool state);
1713 #endif /* U_HIDE_INTERNAL_API */
1714 
// Class ID accessors: a static per-class form and a virtual per-object form.
// NOTE(review): presumably ICU's UObject run-time type identification
// boilerplate - confirm.
1720  static UClassID U_EXPORT2 getStaticClassID();
1721 
1727  virtual UClassID getDynamicClassID() const;
1728 
1729 private:
1730  // Constructors and other object boilerplate are private.
1731  // Instances of RegexMatcher cannot be assigned, copied, cloned, etc.
1732  RegexMatcher(); // default constructor not implemented
1733  RegexMatcher(const RegexPattern *pat); // Construct from an existing pattern; private, reachable by the friends below.
1734  RegexMatcher(const RegexMatcher &other); // Private copy constructor: blocks copying from outside the class.
1735  RegexMatcher &operator =(const RegexMatcher &rhs); // Private assignment: blocks assignment from outside the class.
1736  void init(UErrorCode &status); // Common initialization
1737  void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
1738 
// These classes are granted access to the private members of RegexMatcher.
1739  friend class RegexPattern;
1740  friend class RegexCImpl;
1741 public:
// Internal API, omitted when U_HIDE_INTERNAL_API is defined. Declared in a
// public section even though it sits among the private declarations.
1742 #ifndef U_HIDE_INTERNAL_API
1743 
1744  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1745 #endif /* U_HIDE_INTERNAL_API */
1746 private:
1747 
1748  //
1749  // MatchAt This is the internal interface to the match engine itself.
1750  // Match status comes back in matcher member variables.
1751  //
1752  void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1753  inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
1754  UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
1755  UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
1756  REStackFrame *resetStack();
1757  inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1758  void IncrementTime(UErrorCode &status);
1759  UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status);
1760 
1761  int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1762 
// "Chunk" counterparts of find(), MatchAt() and isWordBoundary(). They take
// int32_t positions where the functions above take int64_t.
// NOTE(review): presumably a fast path for input held in one contiguous UText
// chunk - confirm.
1763  UBool findUsingChunk();
1764  void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1765  UBool isChunkWordBoundary(int32_t pos);
1766 
1767  const RegexPattern *fPattern;
1768  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1769  // should delete it when through.
1770 
1771  const UnicodeString *fInput; // The string being matched. Only used for input()
1772  UText *fInputText; // The text being matched. Is never NULL.
1773  UText *fAltInputText; // A shallow copy of the text being matched.
1774  // Only created if the pattern contains backreferences.
1775  int64_t fInputLength; // Full length of the input text.
1776  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1777 
1778  int64_t fRegionStart; // Start of the input region, default = 0.
1779  int64_t fRegionLimit; // End of input region, default to input.length.
1780 
1781  int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1782  int64_t fAnchorLimit; // See useAnchoringBounds
1783 
1784  int64_t fLookStart; // Region bounds for look-ahead/behind and
1785  int64_t fLookLimit; // other boundary tests. See
1786  // useTransparentBounds
1787 
1788  int64_t fActiveStart; // Currently active bounds for matching.
1789  int64_t fActiveLimit; // Usually is the same as region, but
1790  // is changed to fLookStart/Limit when
1791  // entering look around regions.
1792 
1793  UBool fTransparentBounds; // True if using transparent bounds.
1794  UBool fAnchoringBounds; // True if using anchoring bounds.
1795 
1796  UBool fMatch; // True if the last attempted match was successful.
1797  int64_t fMatchStart; // Position of the start of the most recent match
1798  int64_t fMatchEnd; // First position after the end of the most recent match
1799  // Zero if no previous match, even when a region
1800  // is active.
1801  int64_t fLastMatchEnd; // First position after the end of the previous match,
1802  // or -1 if there was no previous match.
1803  int64_t fAppendPosition; // First position after the end of the previous
1804  // appendReplacement(). As described by the
1805  // JavaDoc for Java Matcher, where it is called
1806  // "append position"
1807  UBool fHitEnd; // True if the last match touched the end of input.
1808  UBool fRequireEnd; // True if the last match required end-of-input
1809  // (matched $ or Z)
1810 
1811  UVector64 *fStack; // NOTE(review): presumably the backtrack stack sized by fFrameSize and capped by fStackLimit - confirm.
1812  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1813  // which will contain the capture group results.
1814  // NOT valid while match engine is running.
1815 
1816  int64_t *fData; // Data area for use by the compiled pattern.
1817  int64_t fSmallData[8]; // Use this for data if it's enough.
1818 
1819  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1820  // match engine run. Zero for unlimited.
1821 
1822  int32_t fTime; // Match time, accumulates while matching.
1823  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1824  // Kept separately from fTime to keep as much
1825  // code as possible out of the inline
1826  // StateSave function.
1827 
1828  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1829  // stack, in bytes. Zero for unlimited.
1830 
1831  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
1832  // NULL if there is no callback.
1833  const void *fCallbackContext; // User Context ptr for callback function.
1834 
1835  URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback funct.
1836  // NULL if there is no callback.
1837  const void *fFindProgressCallbackContext; // User Context ptr for callback function.
1838 
1839 
1840  UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1841 
1842  UBool fTraceDebug; // Set true for debug tracing of match engine.
1843 
1844  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1845  // reported, or that permanently disables this matcher.
1846 
1847  RuleBasedBreakIterator *fWordBreakItr; // NOTE(review): presumably the break iterator behind isUWordBoundary() - confirm.
1848 };
1849 
1851 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1852 #endif