00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef UNICODESET_H
00012 #define UNICODESET_H
00013
00014 #include "unicode/unifilt.h"
00015 #include "unicode/unistr.h"
00016 #include "unicode/uset.h"
00017
00023 U_NAMESPACE_BEGIN
00024
// Forward declarations. In this header these types are used only as pointer
// or reference types (UnicodeSet data members and function parameters), so
// their complete definitions are not required here.
00025 class BMPSet;
00026 class ParsePosition;
00027 class SymbolTable;
00028 class UnicodeSetStringSpan;
00029 class UVector;
00030 class RuleCharacterIterator;
00031
/**
 * UnicodeSet: a UnicodeFilter subclass exported with U_COMMON_API.
 *
 * What the declarations and the inline definitions after the class show:
 *  - the API works with UChar32 code points / (start, end) pairs and with
 *    UnicodeString arguments;
 *  - sets can be built or modified from pattern strings (applyPattern) and
 *    from properties (applyIntPropertyValue, applyPropertyAlias);
 *  - isFrozen() is true when bmpSet or stringSpan is non-NULL;
 *  - isBogus() returns the kIsBogus bit of fFlags;
 *  - fromUSet()/toUSet() convert between UnicodeSet* and USet* with
 *    reinterpret_cast, i.e. a USet* is treated as a pointer to this class.
 *
 * NOTE(review): because the class is exported and cast to/from USet, the
 * order of data members and virtual functions is presumably part of the
 * binary interface -- do not reorder declarations without confirming.
 */
00272 class U_COMMON_API UnicodeSet : public UnicodeFilter {
00273
      // Data members declared before any access specifier, hence private.
      // NOTE(review): len/capacity presumably are the used length and the
      // allocated size of `list` -- confirm against the implementation file.
00274 int32_t len;
00275 int32_t capacity;
00276 UChar32* list;
      // isFrozen() reports "frozen" when this pointer (or stringSpan below)
      // is non-NULL.
00277 BMPSet *bmpSet;
      // NOTE(review): buffer/bufferCapacity look like a second array that
      // swapBuffers() exchanges with `list` -- confirm.
00278 UChar32* buffer;
00279 int32_t bufferCapacity;
      // NOTE(review): patLen/pat presumably hold a stored pattern string and
      // its length, managed by setPattern()/releasePattern() -- confirm.
00280 int32_t patLen;
00281
00291 UChar *pat;
      // NOTE(review): presumably the multi-character strings of the set;
      // see allocateStrings(), getStringCount(), getString() -- confirm.
00292 UVector* strings;
      // Second pointer tested by isFrozen().
00293 UnicodeSetStringSpan *stringSpan;
00294
00295 private:
      // Bit values for fFlags; kIsBogus is the only one declared here.
00296 enum {
00297 kIsBogus = 1
00298 };
00299 uint8_t fFlags;
00300 public:
      // Inline (defined after the class): returns (fFlags & kIsBogus) as UBool.
00310 inline UBool isBogus(void) const;
00311
      // NOTE(review): presumably puts the set into the state for which
      // isBogus() returns true -- confirm in the implementation.
00328 void setToBogus();
00329
00330 public:
00331
      // Value bounds. 0x10ffff is the highest Unicode code point.
      // NOTE(review): presumably the smallest/largest code point a set may
      // contain -- confirm.
00332 enum {
00337 MIN_VALUE = 0,
00338
00343 MAX_VALUE = 0x10ffff
00344 };
00345
00346
00347
00348
00349
00350 public:
00351
      // ---- Construction, copying, destruction ----
00356 UnicodeSet();
00357
      // Constructs from a pair of code points.
      // NOTE(review): presumably the inclusive range start..end -- confirm.
00366 UnicodeSet(UChar32 start, UChar32 end);
00367
      // Pattern-based constructors. All take a UErrorCode& status; the longer
      // overloads additionally take an options word, a SymbolTable pointer
      // and, for the last one, a ParsePosition reference.
00376 UnicodeSet(const UnicodeString& pattern,
00377 UErrorCode& status);
00378
00391 UnicodeSet(const UnicodeString& pattern,
00392 uint32_t options,
00393 const SymbolTable* symbols,
00394 UErrorCode& status);
00395
00409 UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
00410 uint32_t options,
00411 const SymbolTable* symbols,
00412 UErrorCode& status);
00413
      // Copy constructor.
00418 UnicodeSet(const UnicodeSet& o);
00419
00424 virtual ~UnicodeSet();
00425
      // Copy assignment.
00431 UnicodeSet& operator=(const UnicodeSet& o);
00432
      // ---- Comparison, cloning, hashing ----
      // operator!= is defined inline after the class as !operator==(o).
00444 virtual UBool operator==(const UnicodeSet& o) const;
00445
00451 UBool operator!=(const UnicodeSet& o) const;
00452
      // Returns the copy through the UnicodeFunctor* base type.
00462 virtual UnicodeFunctor* clone() const;
00463
00471 virtual int32_t hashCode(void) const;
00472
      // ---- Conversion to/from the USet handle type ----
      // All four are inline reinterpret_casts (see definitions after the
      // class); no object is created or copied.
00481 inline static UnicodeSet *fromUSet(USet *uset);
00482
00491 inline static const UnicodeSet *fromUSet(const USet *uset);
00492
00500 inline USet *toUSet();
00501
00502
00510 inline const USet * toUSet() const;
00511
00512
00513
00514
00515
00516
      // ---- Freezing ----
      // Inline: true when bmpSet or stringSpan is non-NULL.
00525 inline UBool isFrozen() const;
00526
      // NOTE(review): presumably makes isFrozen() true and returns this
      // object as UnicodeFunctor* -- confirm.
00540 UnicodeFunctor *freeze();
00541
      // NOTE(review): presumably returns a new, non-frozen copy -- confirm
      // ownership of the returned pointer.
00550 UnicodeFunctor *cloneAsThawed() const;
00551
00552
00553
00554
00555
      // ---- Replacing the set's contents ----
      // Returns UnicodeSet& (NOTE(review): presumably *this, for chaining).
00566 UnicodeSet& set(UChar32 start, UChar32 end);
00567
      // Static check of `pattern` at index `pos`; does not touch any set.
00573 static UBool resemblesPattern(const UnicodeString& pattern,
00574 int32_t pos);
00575
      // applyPattern overloads mirror the three pattern constructors above
      // (same parameter lists) and return UnicodeSet&.
00588 UnicodeSet& applyPattern(const UnicodeString& pattern,
00589 UErrorCode& status);
00590
00607 UnicodeSet& applyPattern(const UnicodeString& pattern,
00608 uint32_t options,
00609 const SymbolTable* symbols,
00610 UErrorCode& status);
00611
00643 UnicodeSet& applyPattern(const UnicodeString& pattern,
00644 ParsePosition& pos,
00645 uint32_t options,
00646 const SymbolTable* symbols,
00647 UErrorCode& status);
00648
      // Writes a pattern into `result` and returns a UnicodeString reference;
      // escapeUnprintable defaults to FALSE.
00662 virtual UnicodeString& toPattern(UnicodeString& result,
00663 UBool escapeUnprintable = FALSE) const;
00664
      // Property-based modification: by UProperty enum + integer value, or by
      // property/value names given as strings. Errors go through `ec`.
00687 UnicodeSet& applyIntPropertyValue(UProperty prop,
00688 int32_t value,
00689 UErrorCode& ec);
00690
00720 UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
00721 const UnicodeString& value,
00722 UErrorCode& ec);
00723
      // ---- Queries ----
00732 virtual int32_t size(void) const;
00733
00740 virtual UBool isEmpty(void) const;
00741
      // contains / containsAll / containsNone overloads for a single code
      // point, a (start, end) pair, another set, or a string.
00749 virtual UBool contains(UChar32 c) const;
00750
00759 virtual UBool contains(UChar32 start, UChar32 end) const;
00760
00768 UBool contains(const UnicodeString& s) const;
00769
00777 virtual UBool containsAll(const UnicodeSet& c) const;
00778
00786 UBool containsAll(const UnicodeString& s) const;
00787
00796 UBool containsNone(UChar32 start, UChar32 end) const;
00797
00805 UBool containsNone(const UnicodeSet& c) const;
00806
00814 UBool containsNone(const UnicodeString& s) const;
00815
      // Each containsSome overload is defined inline after the class as the
      // negation of the containsNone overload with the same arguments.
00824 inline UBool containsSome(UChar32 start, UChar32 end) const;
00825
00833 inline UBool containsSome(const UnicodeSet& s) const;
00834
00842 inline UBool containsSome(const UnicodeString& s) const;
00843
      // ---- Spanning ----
      // The UChar* overloads are implemented out of line. The UnicodeString
      // overloads are inline wrappers that clamp the index argument to
      // [0, s.length()] and forward to the UChar* overloads.
00862 int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
00863
00876 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
00877
00895 int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
00896
00910 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
00911
      // Same operations on char* input.
      // NOTE(review): the name implies UTF-8 encoded text -- confirm.
00930 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
00931
00949 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
00950
      // Matching against Replaceable text; `offset` is passed by non-const
      // reference, so the callee may update it.
00955 virtual UMatchDegree matches(const Replaceable& text,
00956 int32_t& offset,
00957 int32_t limit,
00958 UBool incremental);
00959
00960 private:
      // Private static helper taking the text, a start/limit pair and a string.
      // NOTE(review): presumably used by matches() -- confirm.
00982 static int32_t matchRest(const Replaceable& text,
00983 int32_t start, int32_t limit,
00984 const UnicodeString& s);
00985
      // NOTE(review): presumably locates c within `list` -- confirm the
      // meaning of the returned index.
00995 int32_t findCodePoint(UChar32 c) const;
00996
00997 public:
00998
00999
      // Takes the destination set by non-const reference.
01006 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
01007
      // Index <-> code point lookups.
      // NOTE(review): behaviour for a missing c / out-of-range index is not
      // visible here -- confirm.
01016 int32_t indexOf(UChar32 c) const;
01017
01027 UChar32 charAt(int32_t index) const;
01028
      // ---- Modification ----
      // All modifiers return UnicodeSet& (NOTE(review): presumably *this).
01043 virtual UnicodeSet& add(UChar32 start, UChar32 end);
01044
01052 UnicodeSet& add(UChar32 c);
01053
01065 UnicodeSet& add(const UnicodeString& s);
01066
01067 private:
      // Private helpers for the string-taking methods.
      // NOTE(review): getSingleCP presumably returns the code point when s is
      // exactly one code point -- confirm the "not single" return value.
01073 static int32_t getSingleCP(const UnicodeString& s);
01074
01075 void _add(const UnicodeString& s);
01076
01077 public:
      // *All(const UnicodeString&) variants.
      // NOTE(review): presumably operate on every code point of s rather than
      // on s as one string -- confirm.
01086 UnicodeSet& addAll(const UnicodeString& s);
01087
01096 UnicodeSet& retainAll(const UnicodeString& s);
01097
01106 UnicodeSet& complementAll(const UnicodeString& s);
01107
01116 UnicodeSet& removeAll(const UnicodeString& s);
01117
      // Static factories returning a raw UnicodeSet*.
      // NOTE(review): presumably heap-allocated and owned by the caller --
      // confirm.
01126 static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
01127
01128
01136 static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
01137
01151 virtual UnicodeSet& retain(UChar32 start, UChar32 end);
01152
01153
01159 UnicodeSet& retain(UChar32 c);
01160
01174 virtual UnicodeSet& remove(UChar32 start, UChar32 end);
01175
01183 UnicodeSet& remove(UChar32 c);
01184
01194 UnicodeSet& remove(const UnicodeString& s);
01195
01203 virtual UnicodeSet& complement(void);
01204
01219 virtual UnicodeSet& complement(UChar32 start, UChar32 end);
01220
01228 UnicodeSet& complement(UChar32 c);
01229
01240 UnicodeSet& complement(const UnicodeString& s);
01241
      // Set-with-set operations.
01254 virtual UnicodeSet& addAll(const UnicodeSet& c);
01255
01267 virtual UnicodeSet& retainAll(const UnicodeSet& c);
01268
01280 virtual UnicodeSet& removeAll(const UnicodeSet& c);
01281
01292 virtual UnicodeSet& complementAll(const UnicodeSet& c);
01293
01300 virtual UnicodeSet& clear(void);
01301
      // NOTE(review): `attribute` is presumably a bit mask of USET_* option
      // values from uset.h -- confirm.
01327 UnicodeSet& closeOver(int32_t attribute);
01328
01335 virtual UnicodeSet &removeAllStrings();
01336
      // ---- Range iteration ----
      // NOTE(review): valid index range is presumably
      // 0 .. getRangeCount()-1 -- confirm.
01344 virtual int32_t getRangeCount(void) const;
01345
01353 virtual UChar32 getRangeStart(int32_t index) const;
01354
01362 virtual UChar32 getRangeEnd(int32_t index) const;
01363
      // Writes into a caller-supplied uint16_t array of destCapacity
      // elements; errors go through `ec`.
      // NOTE(review): meaning of the return value (presumably the length
      // written/needed) is not visible here.
01412 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
01413
01420 virtual UnicodeSet& compact();
01421
      // Class-ID accessors (static and per-object virtual variants).
01433 static UClassID U_EXPORT2 getStaticClassID(void);
01434
01443 virtual UClassID getDynamicClassID(void) const;
01444
01445 private:
01446
01447
01448
      // USetAccess is granted access to the private members below.
01449 friend class USetAccess;
01450
01451 int32_t getStringCount() const;
01452
01453 const UnicodeString* getString(int32_t index) const;
01454
01455
01456
01457
01458
01459 private:
01460
      // Private virtual.
      // NOTE(review): presumably an override of a UnicodeFilter/UnicodeMatcher
      // hook -- confirm in the base class.
01466 virtual UBool matchesIndexValue(uint8_t v) const;
01467
01468 private:
01469
01470
01471
01472
01473
      // Private constructor distinguished from the public copy constructor
      // only by an extra, unnamed UBool parameter.
      // NOTE(review): presumably the "copy as thawed" path used by
      // cloneAsThawed() -- confirm.
01474 UnicodeSet(const UnicodeSet& o, UBool );
01475
01476
01477
01478
01479
      // Internal pattern parser working on a RuleCharacterIterator; receives
      // an output string `rebuiltPat` by non-const reference.
01480 void applyPattern(RuleCharacterIterator& chars,
01481 const SymbolTable* symbols,
01482 UnicodeString& rebuiltPat,
01483 uint32_t options,
01484 UErrorCode& ec);
01485
01486
01487
01488
01489
      // Storage management helpers.
      // NOTE(review): presumably grow `list` / `buffer` to hold newLen
      // entries and report allocation failure through `ec` -- confirm.
01490 void ensureCapacity(int32_t newLen, UErrorCode& ec);
01491
01492 void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
01493
01494 void swapBuffers(void);
01495
01496 UBool allocateStrings(UErrorCode &status);
01497
      // Pattern generation helpers behind toPattern().
01498 UnicodeString& _toPattern(UnicodeString& result,
01499 UBool escapeUnprintable) const;
01500
01501 UnicodeString& _generatePattern(UnicodeString& result,
01502 UBool escapeUnprintable) const;
01503
      // Static helpers that append a string or a single code point to `buf`.
01504 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
01505
01506 static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
01507
01508
01509
01510
01511
      // Low-level set operations against a raw UChar32 array of otherLen
      // entries.
      // NOTE(review): the meaning of `polarity` is not visible in this
      // header -- confirm in the implementation.
01512 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
01513
01514 void add(const UChar32* other, int32_t otherLen, int8_t polarity);
01515
01516 void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
01517
      // Property-pattern detection and parsing, in string-based and
      // iterator-based forms.
01523 static UBool resemblesPropertyPattern(const UnicodeString& pattern,
01524 int32_t pos);
01525
01526 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
01527 int32_t iterOpts);
01528
01567 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
01568 ParsePosition& ppos,
01569 UErrorCode &ec);
01570
01571 void applyPropertyPattern(RuleCharacterIterator& chars,
01572 UnicodeString& rebuiltPat,
01573 UErrorCode& ec);
01574
      // NOTE(review): returns a pointer to a const set selected by `src`;
      // presumably library-owned (must not be deleted by callers) -- confirm.
01575 static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
01576
      // Callback type: receives a code point and an opaque context pointer
      // and returns a UBool.
01581 typedef UBool (*Filter)(UChar32 codePoint, void* context);
01582
      // Applies a Filter callback with its context.
      // NOTE(review): `src` presumably has the same meaning as in
      // getInclusions() -- confirm.
01592 void applyFilter(Filter filter,
01593 void* context,
01594 int32_t src,
01595 UErrorCode &status);
01596
      // NOTE(review): presumably store / free the pat + patLen members --
      // confirm.
01600 void setPattern(const UnicodeString& newPat);
01604 void releasePattern();
01605
      // UnicodeSetIterator is granted access to private members.
01606 friend class UnicodeSetIterator;
01607 };
01608
01609
01610
01611 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
01612 return !operator==(o);
01613 }
01614
01615 inline UBool UnicodeSet::isFrozen() const {
01616 return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
01617 }
01618
01619 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
01620 return !containsNone(start, end);
01621 }
01622
01623 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
01624 return !containsNone(s);
01625 }
01626
01627 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
01628 return !containsNone(s);
01629 }
01630
01631 inline UBool UnicodeSet::isBogus() const {
01632 return (UBool)(fFlags & kIsBogus);
01633 }
01634
01635 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
01636 return reinterpret_cast<UnicodeSet *>(uset);
01637 }
01638
01639 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
01640 return reinterpret_cast<const UnicodeSet *>(uset);
01641 }
01642
01643 inline USet *UnicodeSet::toUSet() {
01644 return reinterpret_cast<USet *>(this);
01645 }
01646
01647 inline const USet *UnicodeSet::toUSet() const {
01648 return reinterpret_cast<const USet *>(this);
01649 }
01650
01651 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
01652 int32_t sLength=s.length();
01653 if(start<0) {
01654 start=0;
01655 } else if(start>sLength) {
01656 start=sLength;
01657 }
01658 return start+span(s.getBuffer()+start, sLength-start, spanCondition);
01659 }
01660
01661 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
01662 int32_t sLength=s.length();
01663 if(limit<0) {
01664 limit=0;
01665 } else if(limit>sLength) {
01666 limit=sLength;
01667 }
01668 return spanBack(s.getBuffer(), limit, spanCondition);
01669 }
01670
01671 U_NAMESPACE_END
01672
01673 #endif