00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef UNICODESET_H
00014 #define UNICODESET_H
00015
00016 #include "unicode/utypes.h"
00017
00018 #if U_SHOW_CPLUSPLUS_API
00019
00020 #include "unicode/ucpmap.h"
00021 #include "unicode/unifilt.h"
00022 #include "unicode/unistr.h"
00023 #include "unicode/uset.h"
00024
00030 U_NAMESPACE_BEGIN
00031
00032
// Forward declarations. Each of these types appears in this header only as a
// pointer, a reference, or a friend, so a full definition is not required here.
00033 class BMPSet;
00034 class ParsePosition;
00035 class RBBIRuleScanner;
00036 class SymbolTable;
00037 class UnicodeSetStringSpan;
00038 class UVector;
00039 class RuleCharacterIterator;
00040
/**
 * UnicodeSet: a final class derived from UnicodeFilter whose API works with
 * UChar32 code points (single values and start/end pairs) and with
 * UnicodeString values.
 *
 * NOTE(review): only declarations are visible in this header. Unless a comment
 * refers to an in-class initializer or to an inline body defined in this file,
 * the descriptions below are inferred from names and signatures and should be
 * confirmed against the implementation.
 */
00281 class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter {
00282 private:
// Element count of the embedded stackList array and initial value of `capacity`.
00287 static constexpr int32_t INITIAL_CAPACITY = 25;
00288
// Bit in fFlags that isBogus() tests.
00289 static constexpr uint8_t kIsBogus = 1;
00290
// Code point array used by the set; initially points at the in-object stackList.
00291 UChar32* list = stackList;
// Capacity associated with `list`; starts at INITIAL_CAPACITY.
00292 int32_t capacity = INITIAL_CAPACITY;
// Presumably the number of entries of `list` in use; starts at 1 -- TODO confirm
// what the initial entry represents.
00293 int32_t len = 1;
// Flag bits; see kIsBogus.
00294 uint8_t fFlags = 0;
00295
// isFrozen() returns true when this pointer (or stringSpan below) is non-null.
00296 BMPSet *bmpSet = nullptr;
// Second UChar32 array and its capacity; presumably scratch storage exchanged
// with `list` by swapBuffers() -- TODO confirm.
00297 UChar32* buffer = nullptr;
00298 int32_t bufferCapacity = 0;
00299
// Pattern text and its length; presumably maintained by setPattern() and
// releasePattern() -- TODO confirm ownership.
00309 char16_t *pat = nullptr;
00310 int32_t patLen = 0;
00311
// Presumably the multi-character strings held by the set (see hasStrings(),
// stringsSize(), stringsContains()); null until allocated.
00312 UVector* strings = nullptr;
// isFrozen() returns true when this pointer (or bmpSet above) is non-null.
00313 UnicodeSetStringSpan *stringSpan = nullptr;
00314
// In-object array of INITIAL_CAPACITY code points that `list` points to initially.
00320 UChar32 stackList[INITIAL_CAPACITY];
00321
00322 public:
/** Returns whether the kIsBogus bit of fFlags is set (inline definition later in this file). */
00332 inline UBool isBogus(void) const;
00333
/** Presumably puts the set into the state reported by isBogus() -- body not shown here. */
00350 void setToBogus();
00351
00352 public:
00353
/** Named limits; presumably the smallest and largest code point a set can hold. */
00354 enum {
// Value 0.
00359 MIN_VALUE = 0,
00360
// Value 0x10ffff.
00365 MAX_VALUE = 0x10ffff
00366 };
00367
00368
00369
00370
00371
00372 public:
00373
/** Default constructor. */
00378 UnicodeSet();
00379
/** Constructs a set from a start/end code point pair; presumably an inclusive range -- confirm. */
00388 UnicodeSet(UChar32 start, UChar32 end);
00389
// The declarations inside each "#ifndef U_HIDE_INTERNAL_API" region are hidden
// when that macro is defined.
00390 #ifndef U_HIDE_INTERNAL_API
00391
/** Tag type used to select the constructor that takes a uint16_t buffer. */
00394 enum ESerialization {
00395 kSerialized
00396 };
00397
/**
 * Constructs a set from `bufferLen` uint16_t units in `buffer`; failures are
 * reported through `status`. Presumably reads the layout written by
 * serialize() -- confirm.
 */
00408 UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
00409 ESerialization serialization, UErrorCode &status);
00410 #endif
00411
/** Constructs a set from a pattern string; failures are reported through `status`. */
00420 UnicodeSet(const UnicodeString& pattern,
00421 UErrorCode& status);
00422
00423 #ifndef U_HIDE_INTERNAL_API
00424
/** Pattern constructor that additionally takes option bits and a symbol table pointer. */
00436 UnicodeSet(const UnicodeString& pattern,
00437 uint32_t options,
00438 const SymbolTable* symbols,
00439 UErrorCode& status);
00440 #endif
00441
/**
 * Pattern constructor taking a ParsePosition by non-const reference; presumably
 * parsing starts at `pos` and `pos` is advanced -- confirm.
 */
00455 UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
00456 uint32_t options,
00457 const SymbolTable* symbols,
00458 UErrorCode& status);
00459
/** Copy constructor. */
00464 UnicodeSet(const UnicodeSet& o);
00465
/** Virtual destructor. */
00470 virtual ~UnicodeSet();
00471
/** Copy assignment. */
00477 UnicodeSet& operator=(const UnicodeSet& o);
00478
/** Equality comparison with another UnicodeSet. */
00490 virtual UBool operator==(const UnicodeSet& o) const;
00491
/** Negation of operator== (inline definition later in this file). */
00497 inline UBool operator!=(const UnicodeSet& o) const;
00498
/** Returns a pointer to a copy of this object; presumably heap-allocated and caller-owned -- confirm. */
00508 virtual UnicodeSet* clone() const;
00509
/** Returns a hash code for this set. */
00517 virtual int32_t hashCode(void) const;
00518
// The four conversions below are pure reinterpret_casts between USet* and
// UnicodeSet* (see their inline definitions later in this file).
/** Reinterprets a USet pointer as a UnicodeSet pointer. */
00527 inline static UnicodeSet *fromUSet(USet *uset);
00528
/** Const overload of fromUSet(). */
00537 inline static const UnicodeSet *fromUSet(const USet *uset);
00538
/** Reinterprets this object's address as a USet pointer. */
00546 inline USet *toUSet();
00547
00548
/** Const overload of toUSet(). */
00556 inline const USet * toUSet() const;
00557
00558
00559
00560
00561
00562
/** Returns true when bmpSet or stringSpan is non-null (inline definition later in this file). */
00571 inline UBool isFrozen() const;
00572
/** Presumably makes the set immutable and returns a pointer to it -- confirm the returned pointer. */
00586 UnicodeSet *freeze();
00587
/** Presumably returns a new, modifiable copy even if this set is frozen -- confirm ownership. */
00596 UnicodeSet *cloneAsThawed() const;
00597
00598
00599
00600
00601
/** Presumably replaces the contents with the start/end range; returns a UnicodeSet reference (presumably *this). */
00611 UnicodeSet& set(UChar32 start, UChar32 end);
00612
/** Static check of whether the text of `pattern` at offset `pos` looks like a set pattern (per its name). */
00618 static UBool resemblesPattern(const UnicodeString& pattern,
00619 int32_t pos);
00620
// applyPattern overloads: mirror the pattern constructors above but operate on
// an existing object and return a UnicodeSet reference.
/** Applies a pattern string; failures are reported through `status`. */
00633 UnicodeSet& applyPattern(const UnicodeString& pattern,
00634 UErrorCode& status);
00635
00636 #ifndef U_HIDE_INTERNAL_API
00637
/** applyPattern with option bits and a symbol table pointer. */
00653 UnicodeSet& applyPattern(const UnicodeString& pattern,
00654 uint32_t options,
00655 const SymbolTable* symbols,
00656 UErrorCode& status);
00657 #endif
00658
/** applyPattern with a ParsePosition taken by non-const reference. */
00690 UnicodeSet& applyPattern(const UnicodeString& pattern,
00691 ParsePosition& pos,
00692 uint32_t options,
00693 const SymbolTable* symbols,
00694 UErrorCode& status);
00695
/**
 * Writes a pattern form of this set into `result` and returns a UnicodeString
 * reference (presumably `result`). `escapeUnprintable` defaults to FALSE.
 */
00709 virtual UnicodeString& toPattern(UnicodeString& result,
00710 UBool escapeUnprintable = FALSE) const;
00711
/** Presumably sets this set to the code points whose property `prop` has the given integer `value`; errors via `ec`. */
00734 UnicodeSet& applyIntPropertyValue(UProperty prop,
00735 int32_t value,
00736 UErrorCode& ec);
00737
/** Like applyIntPropertyValue, but the property and its value are given as strings; errors via `ec`. */
00767 UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
00768 const UnicodeString& value,
00769 UErrorCode& ec);
00770
/** Returns the size of the set as an int32_t; confirm whether strings are counted. */
00779 virtual int32_t size(void) const;
00780
/** Returns whether the set is empty. */
00787 virtual UBool isEmpty(void) const;
00788
// Membership queries. The containsSome() overloads are defined inline later in
// this file as the negation of the matching containsNone() overload.
/** Tests a single code point. */
00796 virtual UBool contains(UChar32 c) const;
00797
/** Tests a start/end code point pair; presumably true only if the whole range is contained. */
00806 virtual UBool contains(UChar32 start, UChar32 end) const;
00807
/** Tests a string. */
00815 UBool contains(const UnicodeString& s) const;
00816
/** Presumably true if every element of set `c` is contained in this set. */
00824 virtual UBool containsAll(const UnicodeSet& c) const;
00825
/** Presumably true if every code point of string `s` is contained in this set. */
00833 UBool containsAll(const UnicodeString& s) const;
00834
/** Presumably true if no code point of the start/end range is contained. */
00843 UBool containsNone(UChar32 start, UChar32 end) const;
00844
/** Presumably true if this set and `c` have no element in common. */
00852 UBool containsNone(const UnicodeSet& c) const;
00853
/** Presumably true if no code point of string `s` is contained. */
00861 UBool containsNone(const UnicodeString& s) const;
00862
/** Returns !containsNone(start, end). */
00871 inline UBool containsSome(UChar32 start, UChar32 end) const;
00872
/** Returns !containsNone(s) for a set argument. */
00880 inline UBool containsSome(const UnicodeSet& s) const;
00881
/** Returns !containsNone(s) for a string argument. */
00889 inline UBool containsSome(const UnicodeString& s) const;
00890
// Span functions: take text plus a USetSpanCondition and return an int32_t
// position or length. Exact semantics of the return value are not shown here.
/** Span over `length` char16_t units starting at `s`. */
00909 int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
00910
/**
 * UnicodeString overload: the inline definition pins `start` into
 * [0, s.length()], spans the rest of the string, and adds `start` back to the
 * result.
 */
00923 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
00924
/** Backward counterpart of span() for a char16_t buffer. */
00942 int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
00943
/**
 * UnicodeString overload: the inline definition pins `limit` into
 * [0, s.length()] and calls the buffer overload with that length.
 */
00957 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
00958
/** span() for char text; per its name the text is UTF-8. */
00977 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
00978
/** spanBack() for char text; per its name the text is UTF-8. */
00996 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
00997
/**
 * Matching entry point working on a Replaceable; `offset` is taken by non-const
 * reference. Presumably implements a virtual inherited through UnicodeFilter --
 * confirm.
 */
01002 virtual UMatchDegree matches(const Replaceable& text,
01003 int32_t& offset,
01004 int32_t limit,
01005 UBool incremental);
01006
01007 private:
/** Static helper for matches(); compares `text` between `start` and `limit` against `s` (details not shown). */
01030 static int32_t matchRest(const Replaceable& text,
01031 int32_t start, int32_t limit,
01032 const UnicodeString& s);
01033
/** Presumably locates `c` within `list` and returns an index -- confirm what the index means. */
01043 int32_t findCodePoint(UChar32 c) const;
01044
01045 public:
01046
/** Adds to `toUnionTo`; presumably the characters this set can match. */
01054 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
01055
/** Returns an index for code point `c`; the value returned when `c` is absent is not shown here. */
01064 int32_t indexOf(UChar32 c) const;
01065
/** Returns the code point at `index`; out-of-range behavior is not shown here. */
01075 UChar32 charAt(int32_t index) const;
01076
// Modifiers. Each returns a UnicodeSet reference (presumably *this, to allow
// chaining -- confirm).
/** Adds a start/end code point pair. */
01091 virtual UnicodeSet& add(UChar32 start, UChar32 end);
01092
/** Adds a single code point. */
01100 UnicodeSet& add(UChar32 c);
01101
/** Adds a string. */
01113 UnicodeSet& add(const UnicodeString& s);
01114
01115 private:
/** Static helper; presumably returns the code point if `s` holds exactly one, otherwise a sentinel -- confirm. */
01121 static int32_t getSingleCP(const UnicodeString& s);
01122
/** Internal helper behind add(const UnicodeString&) (per its name). */
01123 void _add(const UnicodeString& s);
01124
01125 public:
// The *All(const UnicodeString&) functions presumably treat `s` as a collection
// of code points rather than as one string -- confirm.
/** Adds the code points of `s`. */
01134 UnicodeSet& addAll(const UnicodeString& s);
01135
/** Retains only the code points of `s`. */
01144 UnicodeSet& retainAll(const UnicodeString& s);
01145
/** Complements the code points of `s`. */
01154 UnicodeSet& complementAll(const UnicodeString& s);
01155
/** Removes the code points of `s`. */
01164 UnicodeSet& removeAll(const UnicodeString& s);
01165
/** Static factory returning a UnicodeSet pointer built from `s`; presumably caller-owned -- confirm. */
01174 static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
01175
01176
/** Static factory returning a UnicodeSet pointer built from the code points of `s`; presumably caller-owned. */
01184 static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
01185
/** Retains only the start/end range. */
01199 virtual UnicodeSet& retain(UChar32 start, UChar32 end);
01200
01201
/** Retains only the code point `c`. */
01207 UnicodeSet& retain(UChar32 c);
01208
/** Removes the start/end range. */
01222 virtual UnicodeSet& remove(UChar32 start, UChar32 end);
01223
/** Removes the code point `c`. */
01231 UnicodeSet& remove(UChar32 c);
01232
/** Removes the string `s`. */
01242 UnicodeSet& remove(const UnicodeString& s);
01243
/** Complements the whole set. */
01251 virtual UnicodeSet& complement(void);
01252
/** Complements the start/end range. */
01267 virtual UnicodeSet& complement(UChar32 start, UChar32 end);
01268
/** Complements the code point `c`. */
01276 UnicodeSet& complement(UChar32 c);
01277
/** Complements the string `s`. */
01288 UnicodeSet& complement(const UnicodeString& s);
01289
// Set-with-set operations.
/** Adds all elements of `c`. */
01302 virtual UnicodeSet& addAll(const UnicodeSet& c);
01303
/** Retains only elements also in `c`. */
01315 virtual UnicodeSet& retainAll(const UnicodeSet& c);
01316
/** Removes all elements of `c`. */
01328 virtual UnicodeSet& removeAll(const UnicodeSet& c);
01329
/** Complements all elements of `c`. */
01340 virtual UnicodeSet& complementAll(const UnicodeSet& c);
01341
/** Clears the set. */
01348 virtual UnicodeSet& clear(void);
01349
/** Closes the set over `attribute`; the accepted attribute values are not shown here. */
01375 UnicodeSet& closeOver(int32_t attribute);
01376
/** Removes all strings from the set (per its name). */
01383 virtual UnicodeSet &removeAllStrings();
01384
// Range iteration: a count plus start/end accessors by index.
/** Returns the number of ranges. */
01392 virtual int32_t getRangeCount(void) const;
01393
/** Returns the first code point of range `index`. */
01401 virtual UChar32 getRangeStart(int32_t index) const;
01402
/** Returns the last code point of range `index`; presumably inclusive -- confirm. */
01410 virtual UChar32 getRangeEnd(int32_t index) const;
01411
/**
 * Writes a uint16_t representation into `dest` (capacity `destCapacity`) and
 * returns an int32_t, presumably the length written or required; errors via
 * `ec`.
 */
01460 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
01461
/** Presumably releases unused storage. */
01468 virtual UnicodeSet& compact();
01469
/** Returns the class ID shared by all UnicodeSet objects (per its name). */
01481 static UClassID U_EXPORT2 getStaticClassID(void);
01482
/** Virtual counterpart of getStaticClassID(). */
01491 virtual UClassID getDynamicClassID(void) const;
01492
01493 private:
01494
01495
01496
// USetAccess is granted access to private members.
01497 friend class USetAccess;
01498
/** Returns a pointer to the string at `index`; presumably an element of `strings`. */
01499 const UnicodeString* getString(int32_t index) const;
01500
01501
01502
01503
01504
01505 private:
01506
/** Private virtual taking an 8-bit index value; presumably overrides a base-class virtual -- confirm. */
01512 virtual UBool matchesIndexValue(uint8_t v) const;
01513
01514 private:
// RBBIRuleScanner is granted access to private members.
01515 friend class RBBIRuleScanner;
01516
01517
01518
01519
01520
/** Private copy constructor with an unnamed UBool tag parameter; presumably used by cloneAsThawed(). */
01521 UnicodeSet(const UnicodeSet& o, UBool );
/** Copies from `o`; `asThawed` presumably controls whether frozen state is copied. */
01522 UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
01523
01524
01525
01526
01527
// Pattern-parsing helpers.
/** Variant of applyPattern that, per its name, ignores white space in the pattern. */
01528 void applyPatternIgnoreSpace(const UnicodeString& pattern,
01529 ParsePosition& pos,
01530 const SymbolTable* symbols,
01531 UErrorCode& status);
01532
/**
 * Parser overload working on a RuleCharacterIterator. `rebuiltPat` is an output
 * string, `caseClosure` is a pointer to a member function with the signature of
 * closeOver(), and `depth` is presumably the recursion depth.
 */
01533 void applyPattern(RuleCharacterIterator& chars,
01534 const SymbolTable* symbols,
01535 UnicodeString& rebuiltPat,
01536 uint32_t options,
01537 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
01538 int32_t depth,
01539 UErrorCode& ec);
01540
01541
01542
01543
01544
// Storage-management helpers.
/** Static: computes a capacity from a requested minimum. */
01545 static int32_t nextCapacity(int32_t minCapacity);
01546
/** Presumably grows `list` to hold `newLen` entries; returns false on failure -- confirm. */
01547 bool ensureCapacity(int32_t newLen);
01548
/** Same as ensureCapacity(), presumably for `buffer`. */
01549 bool ensureBufferCapacity(int32_t newLen);
01550
/** Presumably exchanges `list` and `buffer`. */
01551 void swapBuffers(void);
01552
// Helpers around the `strings` member (per their names).
01553 UBool allocateStrings(UErrorCode &status);
01554 UBool hasStrings() const;
01555 int32_t stringsSize() const;
01556 UBool stringsContains(const UnicodeString &s) const;
01557
// Pattern-generation helpers behind toPattern().
01558 UnicodeString& _toPattern(UnicodeString& result,
01559 UBool escapeUnprintable) const;
01560
01561 UnicodeString& _generatePattern(UnicodeString& result,
01562 UBool escapeUnprintable) const;
01563
/** Static: appends string `s` to the pattern buffer `buf`. */
01564 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
01565
/** Static: appends code point `c` to the pattern buffer `buf`. */
01566 static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
01567
01568
01569
01570
01571
// Low-level operations taking another UChar32 array, its length, and a
// polarity value; the meaning of `polarity` is not shown here.
01572 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
01573
01574 void add(const UChar32* other, int32_t otherLen, int8_t polarity);
01575
01576 void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
01577
// Property-pattern helpers.
/** Static: tests whether `pattern` at `pos` looks like a property pattern (per its name). */
01583 static UBool resemblesPropertyPattern(const UnicodeString& pattern,
01584 int32_t pos);
01585
/** Static: iterator-based overload of the test above. */
01586 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
01587 int32_t iterOpts);
01588
/** Parses a property pattern from `pattern` at `ppos`; errors via `ec`. */
01628 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
01629 ParsePosition& ppos,
01630 UErrorCode &ec);
01631
/** Iterator-based overload; `rebuiltPat` is an output string. */
01632 void applyPropertyPattern(RuleCharacterIterator& chars,
01633 UnicodeString& rebuiltPat,
01634 UErrorCode& ec);
01635
/** Static: returns a const UnicodeSet pointer selected by `src`; errors via `status`. */
01636 static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
01637
/** Predicate callback type: receives a code point and an opaque context pointer, returns UBool. */
01642 typedef UBool (*Filter)(UChar32 codePoint, void* context);
01643
/** Applies the predicate `filter` with `context`; `inclusions` is presumably the candidate set to test. */
01653 void applyFilter(Filter filter,
01654 void* context,
01655 const UnicodeSet* inclusions,
01656 UErrorCode &status);
01657
01658
/** Private overload driven by a UCPMap and a UCPMapValueFilter callback. */
01659 void applyIntPropertyValue(const UCPMap *map,
01660 UCPMapValueFilter *filter, const void *context,
01661 UErrorCode &errorCode);
01662
/** Convenience overload: forwards newPat's buffer and length to the two-argument setPattern(). */
01666 void setPattern(const UnicodeString& newPat) {
01667 setPattern(newPat.getBuffer(), newPat.length());
01668 }
/** Sets the stored pattern from `newPatLen` char16_t units at `newPat` (details not shown here). */
01669 void setPattern(const char16_t *newPat, int32_t newPatLen);
/** Presumably frees `pat` and resets `patLen`. */
01673 void releasePattern();
01674
// UnicodeSetIterator is granted access to private members.
01675 friend class UnicodeSetIterator;
01676 };
01677
01678
01679
01680 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
01681 return !operator==(o);
01682 }
01683
01684 inline UBool UnicodeSet::isFrozen() const {
01685 return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
01686 }
01687
01688 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
01689 return !containsNone(start, end);
01690 }
01691
01692 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
01693 return !containsNone(s);
01694 }
01695
01696 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
01697 return !containsNone(s);
01698 }
01699
01700 inline UBool UnicodeSet::isBogus() const {
01701 return (UBool)(fFlags & kIsBogus);
01702 }
01703
01704 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
01705 return reinterpret_cast<UnicodeSet *>(uset);
01706 }
01707
01708 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
01709 return reinterpret_cast<const UnicodeSet *>(uset);
01710 }
01711
01712 inline USet *UnicodeSet::toUSet() {
01713 return reinterpret_cast<USet *>(this);
01714 }
01715
01716 inline const USet *UnicodeSet::toUSet() const {
01717 return reinterpret_cast<const USet *>(this);
01718 }
01719
01720 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
01721 int32_t sLength=s.length();
01722 if(start<0) {
01723 start=0;
01724 } else if(start>sLength) {
01725 start=sLength;
01726 }
01727 return start+span(s.getBuffer()+start, sLength-start, spanCondition);
01728 }
01729
01730 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
01731 int32_t sLength=s.length();
01732 if(limit<0) {
01733 limit=0;
01734 } else if(limit>sLength) {
01735 limit=sLength;
01736 }
01737 return spanBack(s.getBuffer(), limit, spanCondition);
01738 }
01739
01740 U_NAMESPACE_END
01741
01742 #endif
01743
01744 #endif