00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018
00019
00020
00045 #include "unicode/utypes.h"
00046
00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00048
00049 #include "unicode/uobject.h"
00050 #include "unicode/unistr.h"
00051 #include "unicode/utext.h"
00052 #include "unicode/parseerr.h"
00053
00054 #include "unicode/uregex.h"
00055
00056
00057
00058 U_NAMESPACE_BEGIN
00059
// Forward declarations.
// Every type below is referenced in this header only through a pointer, a
// reference or a friend declaration, so an incomplete type is sufficient and
// the corresponding headers do not need to be included here.
struct Regex8BitSet;
class  RegexCImpl;
class  RegexMatcher;
class  RegexPattern;
struct REStackFrame;
class  RuleBasedBreakIterator;
class  UnicodeSet;
class  UVector;
class  UVector32;
class  UVector64;
00070
/**
 * Debug hook for dumping a compiled pattern.
 *
 * When REGEX_DEBUG is defined, RegexPatternDump is declared as a real
 * function that takes the pattern to dump.  Otherwise any existing macro of
 * that name is removed and RegexPatternDump(pat) is defined to expand to
 * nothing, so call sites compile away without needing their own #ifdef.
 */
#ifdef REGEX_DEBUG
U_INTERNAL void U_EXPORT2
    RegexPatternDump(const RegexPattern *pat);
#else
#undef RegexPatternDump
#define RegexPatternDump(pat)
#endif
00082
00083
00084
00096 class U_I18N_API RegexPattern: public UObject {
00097 public:
00098
00106 RegexPattern();
00107
00114 RegexPattern(const RegexPattern &source);
00115
00121 virtual ~RegexPattern();
00122
00131 UBool operator==(const RegexPattern& that) const;
00132
00141 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);};
00142
00148 RegexPattern &operator =(const RegexPattern &source);
00149
00157 virtual RegexPattern *clone() const;
00158
00159
00184 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex,
00185 UParseError &pe,
00186 UErrorCode &status);
00187
00188
00215 static RegexPattern * U_EXPORT2 compile( UText *regex,
00216 UParseError &pe,
00217 UErrorCode &status);
00218
00243 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex,
00244 uint32_t flags,
00245 UParseError &pe,
00246 UErrorCode &status);
00247
00248
00275 static RegexPattern * U_EXPORT2 compile( UText *regex,
00276 uint32_t flags,
00277 UParseError &pe,
00278 UErrorCode &status);
00279
00280
00303 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex,
00304 uint32_t flags,
00305 UErrorCode &status);
00306
00307
00332 static RegexPattern * U_EXPORT2 compile( UText *regex,
00333 uint32_t flags,
00334 UErrorCode &status);
00335
00336
00342 virtual uint32_t flags() const;
00343
00361 virtual RegexMatcher *matcher(const UnicodeString &input,
00362 UErrorCode &status) const;
00363
00364
00369 enum PatternIsUTextFlag { PATTERN_IS_UTEXT };
00370
00390 virtual RegexMatcher *matcher(UText *input,
00391 PatternIsUTextFlag flag,
00392 UErrorCode &status) const;
00393
00394 private:
00408 RegexMatcher *matcher(const UChar *input,
00409 UErrorCode &status) const;
00410 public:
00411
00412
00424 virtual RegexMatcher *matcher(UErrorCode &status) const;
00425
00426
00441 static UBool U_EXPORT2 matches(const UnicodeString ®ex,
00442 const UnicodeString &input,
00443 UParseError &pe,
00444 UErrorCode &status);
00445
00446
00461 static UBool U_EXPORT2 matches(UText *regex,
00462 UText *input,
00463 UParseError &pe,
00464 UErrorCode &status);
00465
00466
00475 virtual UnicodeString pattern() const;
00476
00477
00488 virtual UText *patternText() const;
00489
00490
00516 virtual int32_t split(const UnicodeString &input,
00517 UnicodeString dest[],
00518 int32_t destCapacity,
00519 UErrorCode &status) const;
00520
00521
00547 virtual int32_t split(UText *input,
00548 UText *dest[],
00549 int32_t destCapacity,
00550 UErrorCode &status) const;
00551
00552
00558 virtual UClassID getDynamicClassID() const;
00559
00565 static UClassID U_EXPORT2 getStaticClassID();
00566
00567 private:
00568
00569
00570
00571 UText *fPattern;
00572 UnicodeString *fPatternString;
00573 uint32_t fFlags;
00574
00575 UVector64 *fCompiledPat;
00576 UnicodeString fLiteralText;
00577
00578
00579 UVector *fSets;
00580 Regex8BitSet *fSets8;
00581
00582
00583 UErrorCode fDeferredStatus;
00584
00585
00586 int32_t fMinMatchLen;
00587
00588
00589
00590
00591 int32_t fFrameSize;
00592
00593
00594 int32_t fDataSize;
00595
00596
00597
00598 UVector32 *fGroupMap;
00599
00600
00601 int32_t fMaxCaptureDigits;
00602
00603 UnicodeSet **fStaticSets;
00604
00605
00606 Regex8BitSet *fStaticSets8;
00607
00608
00609 int32_t fStartType;
00610 int32_t fInitialStringIdx;
00611 int32_t fInitialStringLen;
00612 UnicodeSet *fInitialChars;
00613 UChar32 fInitialChar;
00614 Regex8BitSet *fInitialChars8;
00615 UBool fNeedsAltInput;
00616
00617 friend class RegexCompile;
00618 friend class RegexMatcher;
00619 friend class RegexCImpl;
00620
00621
00622
00623
00624 void init();
00625 void zap();
00626 #ifdef REGEX_DEBUG
00627 void dumpOp(int32_t index) const;
00628 friend void U_EXPORT2 RegexPatternDump(const RegexPattern *);
00629 #endif
00630
00631 };
00632
00633
00634
00644 class U_I18N_API RegexMatcher: public UObject {
00645 public:
00646
00661 RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status);
00662
00678 RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
00679
00701 RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
00702 uint32_t flags, UErrorCode &status);
00703
00725 RegexMatcher(UText *regexp, UText *input,
00726 uint32_t flags, UErrorCode &status);
00727
00728 private:
00742 RegexMatcher(const UnicodeString ®exp, const UChar *input,
00743 uint32_t flags, UErrorCode &status);
00744 public:
00745
00746
00752 virtual ~RegexMatcher();
00753
00754
00761 virtual UBool matches(UErrorCode &status);
00762
00763
00774 virtual UBool matches(int32_t startIndex, UErrorCode &status);
00775
00776
00790 virtual UBool lookingAt(UErrorCode &status);
00791
00792
00806 virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
00807
00808
00821 virtual UBool find();
00822
00823
00833 virtual UBool find(int32_t start, UErrorCode &status);
00834
00835
00845 virtual UnicodeString group(UErrorCode &status) const;
00846
00847
00852 enum MatcherDestIsUTextFlag { MATCHER_DEST_IS_UTEXT };
00853
00869 virtual UText *group(UText *dest, MatcherDestIsUTextFlag flag, UErrorCode &status) const;
00870
00871
00884 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00885
00886
00902 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
00903
00904
00910 virtual int32_t groupCount() const;
00911
00912
00920 virtual int32_t start(UErrorCode &status) const;
00921
00922
00936 virtual int32_t start(int32_t group, UErrorCode &status) const;
00937
00938
00948 virtual int32_t end(UErrorCode &status) const;
00949
00950
00964 virtual int32_t end(int32_t group, UErrorCode &status) const;
00965
00966
00975 virtual RegexMatcher &reset();
00976
00977
00993 virtual RegexMatcher &reset(int32_t index, UErrorCode &status);
00994
00995
01013 virtual RegexMatcher &reset(const UnicodeString &input);
01014
01015
01029 virtual RegexMatcher &reset(UText *input);
01030
01031 private:
01045 RegexMatcher &reset(const UChar *input);
01046 public:
01047
01055 virtual const UnicodeString &input() const;
01056
01065 virtual UText *inputText() const;
01066
01076 virtual UText *getInput(UText *dest) const;
01077
01078
01097 virtual RegexMatcher ®ion(int32_t start, int32_t limit, UErrorCode &status);
01098
01099
01108 virtual int32_t regionStart() const;
01109
01110
01119 virtual int32_t regionEnd() const;
01120
01129 virtual UBool hasTransparentBounds() const;
01130
01149 virtual RegexMatcher &useTransparentBounds(UBool b);
01150
01151
01159 virtual UBool hasAnchoringBounds() const;
01160
01161
01174 virtual RegexMatcher &useAnchoringBounds(UBool b);
01175
01176
01189 virtual UBool hitEnd() const;
01190
01200 virtual UBool requireEnd() const;
01201
01202
01208 virtual const RegexPattern &pattern() const;
01209
01210
01227 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
01228
01229
01250 virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
01251
01252
01273 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
01274
01275
01300 virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
01301
01302
01330 virtual RegexMatcher &appendReplacement(UnicodeString &dest,
01331 const UnicodeString &replacement, UErrorCode &status);
01332
01333
01361 virtual RegexMatcher &appendReplacement(UText *dest,
01362 UText *replacement, UErrorCode &status);
01363
01364
01375 virtual UnicodeString &appendTail(UnicodeString &dest);
01376
01377
01390 virtual UText *appendTail(UText *dest);
01391
01392
01416 virtual int32_t split(const UnicodeString &input,
01417 UnicodeString dest[],
01418 int32_t destCapacity,
01419 UErrorCode &status);
01420
01421
01445 virtual int32_t split(UText *input,
01446 UText *dest[],
01447 int32_t destCapacity,
01448 UErrorCode &status);
01449
01471 virtual void setTimeLimit(int32_t limit, UErrorCode &status);
01472
01479 virtual int32_t getTimeLimit() const;
01480
01502 virtual void setStackLimit(int32_t limit, UErrorCode &status);
01503
01511 virtual int32_t getStackLimit() const;
01512
01513
01527 virtual void setMatchCallback(URegexMatchCallback *callback,
01528 const void *context,
01529 UErrorCode &status);
01530
01531
01542 virtual void getMatchCallback(URegexMatchCallback *&callback,
01543 const void *&context,
01544 UErrorCode &status);
01545
01546
01552 void setTrace(UBool state);
01553
01554
01560 static UClassID U_EXPORT2 getStaticClassID();
01561
01567 virtual UClassID getDynamicClassID() const;
01568
01569 private:
01570
01571
01572 RegexMatcher();
01573 RegexMatcher(const RegexPattern *pat);
01574 RegexMatcher(const RegexMatcher &other);
01575 RegexMatcher &operator =(const RegexMatcher &rhs);
01576 void init(UErrorCode &status);
01577 void init2(UText *t, UErrorCode &e);
01578
01579 friend class RegexPattern;
01580 friend class RegexCImpl;
01581 public:
01583 void resetPreserveRegion();
01584 private:
01585
01586
01587
01588
01589
01590 void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
01591 inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
01592 UBool isWordBoundary(int64_t pos);
01593 UBool isUWordBoundary(int64_t pos);
01594 REStackFrame *resetStack();
01595 inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
01596 void IncrementTime(UErrorCode &status);
01597
01598 int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
01599
01600 UBool findUsingChunk();
01601 void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
01602 UBool isChunkWordBoundary(int32_t pos);
01603
01604 const RegexPattern *fPattern;
01605 RegexPattern *fPatternOwned;
01606
01607
01608 const UnicodeString *fInput;
01609 UText *fInputText;
01610 UText *fAltInputText;
01611
01612 int64_t fInputLength;
01613 int32_t fFrameSize;
01614
01615 int64_t fRegionStart;
01616 int64_t fRegionLimit;
01617
01618 int64_t fAnchorStart;
01619 int64_t fAnchorLimit;
01620
01621 int64_t fLookStart;
01622 int64_t fLookLimit;
01623
01624
01625 int64_t fActiveStart;
01626 int64_t fActiveLimit;
01627
01628
01629
01630 UBool fTransparentBounds;
01631 UBool fAnchoringBounds;
01632
01633 UBool fMatch;
01634 int64_t fMatchStart;
01635 int64_t fMatchEnd;
01636
01637
01638 int64_t fLastMatchEnd;
01639
01640 int64_t fAppendPosition;
01641
01642
01643
01644 UBool fHitEnd;
01645 UBool fRequireEnd;
01646
01647
01648 UVector64 *fStack;
01649 REStackFrame *fFrame;
01650
01651
01652
01653 int64_t *fData;
01654 int64_t fSmallData[8];
01655
01656 int32_t fTimeLimit;
01657
01658
01659 int32_t fTime;
01660 int32_t fTickCounter;
01661
01662
01663
01664
01665 int32_t fStackLimit;
01666
01667
01668 URegexMatchCallback *fCallbackFn;
01669
01670 const void *fCallbackContext;
01671
01672 UBool fInputUniStrMaybeMutable;
01673
01674 UBool fTraceDebug;
01675
01676 UErrorCode fDeferredStatus;
01677
01678
01679 RuleBasedBreakIterator *fWordBreakItr;
01680 };
01681
01682 U_NAMESPACE_END
01683 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
01684 #endif