ICU 77.1  77.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
uniset.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ***************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 ***************************************************************************
11 */
12 
13 #ifndef UNICODESET_H
14 #define UNICODESET_H
15 
16 #include "unicode/utypes.h"
17 
18 #if U_SHOW_CPLUSPLUS_API
19 
20 #include "unicode/ucpmap.h"
21 #include "unicode/unifilt.h"
22 #include "unicode/unistr.h"
23 #include "unicode/uset.h"
24 
30 U_NAMESPACE_BEGIN
31 
32 // Forward Declarations.
33 class BMPSet;
34 class ParsePosition;
35 class RBBIRuleScanner;
36 class SymbolTable;
37 class UnicodeSetStringSpan;
38 class UVector;
39 class RuleCharacterIterator;
40 
285 class U_COMMON_API UnicodeSet final : public UnicodeFilter {
286 private:
291  static constexpr int32_t INITIAL_CAPACITY = 25;
292  // fFlags constant
293  static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid)
294 
295  UChar32* list = stackList; // MUST be terminated with HIGH
296  int32_t capacity = INITIAL_CAPACITY; // capacity of list
297  int32_t len = 1; // length of list used; 1 <= len <= capacity
298  uint8_t fFlags = 0; // Bit flag (see constants above)
299 
300  BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not nullptr.
301  UChar32* buffer = nullptr; // internal buffer, may be nullptr
302  int32_t bufferCapacity = 0; // capacity of buffer
303 
313  char16_t *pat = nullptr;
314  int32_t patLen = 0;
315 
316  UVector* strings_ = nullptr; // maintained in sorted order
317  UnicodeSetStringSpan *stringSpan = nullptr;
318 
324  UChar32 stackList[INITIAL_CAPACITY];
325 
326 public:
336  inline UBool isBogus() const;
337 
354  void setToBogus();
355 
356 public:
357 
358  enum {
363  MIN_VALUE = 0,
364 
369  MAX_VALUE = 0x10ffff
370  };
371 
372  //----------------------------------------------------------------
373  // Constructors &c
374  //----------------------------------------------------------------
375 
376 public:
377 
382  UnicodeSet();
383 
392  UnicodeSet(UChar32 start, UChar32 end);
393 
394 #ifndef U_HIDE_INTERNAL_API
395 
399  kSerialized /* result of serialize() */
400  };
401 
412  UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
413  ESerialization serialization, UErrorCode &status);
414 #endif /* U_HIDE_INTERNAL_API */
415 
424  UnicodeSet(const UnicodeString& pattern,
425  UErrorCode& status);
426 
427 #ifndef U_HIDE_INTERNAL_API
428 
442  UnicodeSet(const UnicodeString& pattern,
443  uint32_t options,
444  const SymbolTable* symbols,
445  UErrorCode& status);
446 #endif /* U_HIDE_INTERNAL_API */
447 
463  UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
464  uint32_t options,
465  const SymbolTable* symbols,
466  UErrorCode& status);
467 
472  UnicodeSet(const UnicodeSet& o);
473 
478  virtual ~UnicodeSet();
479 
485  UnicodeSet& operator=(const UnicodeSet& o);
486 
498  virtual bool operator==(const UnicodeSet& o) const;
499 
505  inline bool operator!=(const UnicodeSet& o) const;
506 
516  virtual UnicodeSet* clone() const override;
517 
525  virtual int32_t hashCode() const;
526 
535  inline static UnicodeSet *fromUSet(USet *uset);
536 
545  inline static const UnicodeSet *fromUSet(const USet *uset);
546 
554  inline USet *toUSet();
555 
556 
564  inline const USet * toUSet() const;
565 
566 
567  //----------------------------------------------------------------
568  // Freezable API
569  //----------------------------------------------------------------
570 
579  inline UBool isFrozen() const;
580 
594  UnicodeSet *freeze();
595 
604  UnicodeSet *cloneAsThawed() const;
605 
606  //----------------------------------------------------------------
607  // Public API
608  //----------------------------------------------------------------
609 
619  UnicodeSet& set(UChar32 start, UChar32 end);
620 
626  static UBool resemblesPattern(const UnicodeString& pattern,
627  int32_t pos);
628 
641  UnicodeSet& applyPattern(const UnicodeString& pattern,
642  UErrorCode& status);
643 
644 #ifndef U_HIDE_INTERNAL_API
645 
663  UnicodeSet& applyPattern(const UnicodeString& pattern,
664  uint32_t options,
665  const SymbolTable* symbols,
666  UErrorCode& status);
667 #endif /* U_HIDE_INTERNAL_API */
668 
702  UnicodeSet& applyPattern(const UnicodeString& pattern,
703  ParsePosition& pos,
704  uint32_t options,
705  const SymbolTable* symbols,
706  UErrorCode& status);
707 
721  virtual UnicodeString& toPattern(UnicodeString& result,
722  UBool escapeUnprintable = false) const override;
723 
746  UnicodeSet& applyIntPropertyValue(UProperty prop,
747  int32_t value,
748  UErrorCode& ec);
749 
779  UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
780  const UnicodeString& value,
781  UErrorCode& ec);
782 
795  virtual int32_t size() const;
796 
803  virtual UBool isEmpty() const;
804 
809  UBool hasStrings() const;
810 
818  virtual UBool contains(UChar32 c) const override;
819 
828  virtual UBool contains(UChar32 start, UChar32 end) const;
829 
837  UBool contains(const UnicodeString& s) const;
838 
846  virtual UBool containsAll(const UnicodeSet& c) const;
847 
855  UBool containsAll(const UnicodeString& s) const;
856 
865  UBool containsNone(UChar32 start, UChar32 end) const;
866 
874  UBool containsNone(const UnicodeSet& c) const;
875 
883  UBool containsNone(const UnicodeString& s) const;
884 
893  inline UBool containsSome(UChar32 start, UChar32 end) const;
894 
902  inline UBool containsSome(const UnicodeSet& s) const;
903 
911  inline UBool containsSome(const UnicodeString& s) const;
912 
931  int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
932 
945  inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
946 
964  int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
965 
979  inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
980 
999  int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
1000 
1018  int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
1019 
1024  virtual UMatchDegree matches(const Replaceable& text,
1025  int32_t& offset,
1026  int32_t limit,
1027  UBool incremental) override;
1028 
1029 private:
1052  static int32_t matchRest(const Replaceable& text,
1053  int32_t start, int32_t limit,
1054  const UnicodeString& s);
1055 
1065  int32_t findCodePoint(UChar32 c) const;
1066 
1067 public:
1068 
1076  virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
1077 
1086  int32_t indexOf(UChar32 c) const;
1087 
1103  UChar32 charAt(int32_t index) const;
1104 
1105 #ifndef U_HIDE_DRAFT_API
1106 
1123  inline U_HEADER_NESTED_NAMESPACE::USetCodePoints codePoints() const {
1124  return U_HEADER_NESTED_NAMESPACE::USetCodePoints(toUSet());
1125  }
1126 
1149  inline U_HEADER_NESTED_NAMESPACE::USetRanges ranges() const {
1150  return U_HEADER_NESTED_NAMESPACE::USetRanges(toUSet());
1151  }
1152 
1173  inline U_HEADER_NESTED_NAMESPACE::USetStrings strings() const {
1174  return U_HEADER_NESTED_NAMESPACE::USetStrings(toUSet());
1175  }
1176 #endif // U_HIDE_DRAFT_API
1177 
1178 #ifndef U_HIDE_DRAFT_API
1179 
1203  inline U_HEADER_NESTED_NAMESPACE::USetElementIterator begin() const {
1204  return U_HEADER_NESTED_NAMESPACE::USetElements(toUSet()).begin();
1205  }
1206 
1215  inline U_HEADER_NESTED_NAMESPACE::USetElementIterator end() const {
1216  return U_HEADER_NESTED_NAMESPACE::USetElements(toUSet()).end();
1217  }
1218 #endif // U_HIDE_DRAFT_API
1219 
1234  virtual UnicodeSet& add(UChar32 start, UChar32 end);
1235 
1246  UnicodeSet& add(UChar32 c);
1247 
1259  UnicodeSet& add(const UnicodeString& s);
1260 
1261  private:
1267  static int32_t getSingleCP(const UnicodeString& s);
1268 
1269  void _add(const UnicodeString& s);
1270 
1271  public:
1280  UnicodeSet& addAll(const UnicodeString& s);
1281 
1289  UnicodeSet& retainAll(const UnicodeString& s);
1290 
1298  UnicodeSet& complementAll(const UnicodeString& s);
1299 
1307  UnicodeSet& removeAll(const UnicodeString& s);
1308 
1317  static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1318 
1319 
1327  static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1328 
1340  virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1341 
1342 
1351  UnicodeSet& retain(UChar32 c);
1352 
1363  UnicodeSet& retain(const UnicodeString &s);
1364 
1378  virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1379 
1390  UnicodeSet& remove(UChar32 c);
1391 
1401  UnicodeSet& remove(const UnicodeString& s);
1402 
1415  virtual UnicodeSet& complement();
1416 
1429  virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1430 
1441  UnicodeSet& complement(UChar32 c);
1442 
1452  UnicodeSet& complement(const UnicodeString& s);
1453 
1466  virtual UnicodeSet& addAll(const UnicodeSet& c);
1467 
1479  virtual UnicodeSet& retainAll(const UnicodeSet& c);
1480 
1492  virtual UnicodeSet& removeAll(const UnicodeSet& c);
1493 
1504  virtual UnicodeSet& complementAll(const UnicodeSet& c);
1505 
1512  virtual UnicodeSet& clear();
1513 
1541  UnicodeSet& closeOver(int32_t attribute);
1542 
1549  virtual UnicodeSet &removeAllStrings();
1550 
1558  virtual int32_t getRangeCount() const;
1559 
1567  virtual UChar32 getRangeStart(int32_t index) const;
1568 
1576  virtual UChar32 getRangeEnd(int32_t index) const;
1577 
1626  int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1627 
1634  virtual UnicodeSet& compact();
1635 
1647  static UClassID U_EXPORT2 getStaticClassID();
1648 
1657  virtual UClassID getDynamicClassID() const override;
1658 
1659  private:
1660 
1661  // Private API for the USet API
1662 
1663  friend class USetAccess;
1664 
1665  const UnicodeString* getString(int32_t index) const;
1666 
1667  //----------------------------------------------------------------
1668  // RuleBasedTransliterator support
1669  //----------------------------------------------------------------
1670 
1671 private:
1672 
1678  virtual UBool matchesIndexValue(uint8_t v) const override;
1679 
1680 private:
1681  friend class RBBIRuleScanner;
1682 
1683  //----------------------------------------------------------------
1684  // Implementation: Clone as thawed (see ICU4J Freezable)
1685  //----------------------------------------------------------------
1686 
1687  UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1688  UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
1689 
1690  //----------------------------------------------------------------
1691  // Implementation: Pattern parsing
1692  //----------------------------------------------------------------
1693 
1694  void applyPatternIgnoreSpace(const UnicodeString& pattern,
1695  ParsePosition& pos,
1696  const SymbolTable* symbols,
1697  UErrorCode& status);
1698 
1699  void applyPattern(RuleCharacterIterator& chars,
1700  const SymbolTable* symbols,
1701  UnicodeString& rebuiltPat,
1702  uint32_t options,
1703  UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1704  int32_t depth,
1705  UErrorCode& ec);
1706 
1707  void closeOverCaseInsensitive(bool simple);
1708  void closeOverAddCaseMappings();
1709 
1710  //----------------------------------------------------------------
1711  // Implementation: Utility methods
1712  //----------------------------------------------------------------
1713 
1714  static int32_t nextCapacity(int32_t minCapacity);
1715 
1716  bool ensureCapacity(int32_t newLen);
1717 
1718  bool ensureBufferCapacity(int32_t newLen);
1719 
1720  void swapBuffers();
1721 
1722  UBool allocateStrings(UErrorCode &status);
1723  int32_t stringsSize() const;
1724  UBool stringsContains(const UnicodeString &s) const;
1725 
1726  UnicodeString& _toPattern(UnicodeString& result,
1727  UBool escapeUnprintable) const;
1728 
1729  UnicodeString& _generatePattern(UnicodeString& result,
1730  UBool escapeUnprintable) const;
1731 
1732  static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1733 
1734  static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1735 
1736  static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
1737  UBool escapeUnprintable);
1738 
1739  //----------------------------------------------------------------
1740  // Implementation: Fundamental operators
1741  //----------------------------------------------------------------
1742 
1743  void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1744 
1745  void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1746 
1747  void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1748 
1754  static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1755  int32_t pos);
1756 
1757  static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1758  int32_t iterOpts);
1759 
1799  UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1800  ParsePosition& ppos,
1801  UErrorCode &ec);
1802 
1803  void applyPropertyPattern(RuleCharacterIterator& chars,
1804  UnicodeString& rebuiltPat,
1805  UErrorCode& ec);
1806 
1811  typedef UBool (*Filter)(UChar32 codePoint, void* context);
1812 
1822  void applyFilter(Filter filter,
1823  void* context,
1824  const UnicodeSet* inclusions,
1825  UErrorCode &status);
1826 
1830  void setPattern(const UnicodeString& newPat) {
1831  setPattern(newPat.getBuffer(), newPat.length());
1832  }
1833  void setPattern(const char16_t *newPat, int32_t newPatLen);
1837  void releasePattern();
1838 
1839  friend class UnicodeSetIterator;
1840 };
1841 
1842 
1843 
// Inequality comparison, defined as the logical negation of operator==(o).
1844 inline bool UnicodeSet::operator!=(const UnicodeSet& o) const {
1845  return !operator==(o);
1846 }
1847 
// Reports whether this set is frozen: true when either of the two pointers
// bmpSet or stringSpan is non-null.
1848 inline UBool UnicodeSet::isFrozen() const {
1849  return bmpSet != nullptr || stringSpan != nullptr;
1850 }
1851 
// Range overload: returns the negation of containsNone(start, end).
1852 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1853  return !containsNone(start, end);
1854 }
1855 
// Set overload: returns the negation of containsNone(s).
1856 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1857  return !containsNone(s);
1858 }
1859 
// String overload: returns the negation of containsNone(s).
1860 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1861  return !containsNone(s);
1862 }
1863 
// Tests the kIsBogus bit of fFlags. kIsBogus is 1, so the masked value
// returned is either 0 or 1.
1864 inline UBool UnicodeSet::isBogus() const {
1865  return fFlags & kIsBogus;
1866 }
1867 
// Reinterprets a USet pointer (the C API handle type) as a UnicodeSet
// pointer. This is a pure pointer cast: nothing is copied or checked, and a
// null input is passed through unchanged.
1868 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
1869  return reinterpret_cast<UnicodeSet *>(uset);
1870 }
1871 
// Const overload: reinterprets a const USet pointer as a const UnicodeSet
// pointer. Pure pointer cast; nothing is copied or checked.
1872 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1873  return reinterpret_cast<const UnicodeSet *>(uset);
1874 }
1875 
// Returns this object viewed as a USet pointer (the C API handle type).
// Pure pointer cast of `this`; no new object is created.
1876 inline USet *UnicodeSet::toUSet() {
1877  return reinterpret_cast<USet *>(this);
1878 }
1879 
// Const overload: returns this object viewed as a const USet pointer.
// Pure pointer cast of `this`; no new object is created.
1880 inline const USet *UnicodeSet::toUSet() const {
1881  return reinterpret_cast<const USet *>(this);
1882 }
1883 
// UnicodeString convenience overload of span().
// `start` is first clamped into [0, s.length()]. The call then forwards to
// the (const char16_t*, length) overload on the text from `start` to the end
// of `s`. That overload's result is relative to the pointer it was given, so
// `start` is added back to make the returned value an index into `s`.
1884 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1885  int32_t sLength=s.length();
1886  if(start<0) {
1887  start=0;
1888  } else if(start>sLength) {
1889  start=sLength;
1890  }
1891  return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1892 }
1893 
// UnicodeString convenience overload of spanBack().
// `limit` is first clamped into [0, s.length()]. The call then forwards to
// the (const char16_t*, length) overload with the string's buffer and the
// clamped `limit` as the length, so only the first `limit` code units of `s`
// are considered. The delegate's result is returned unchanged.
1894 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1895  int32_t sLength=s.length();
1896  if(limit<0) {
1897  limit=0;
1898  } else if(limit>sLength) {
1899  limit=sLength;
1900  }
1901  return spanBack(s.getBuffer(), limit, spanCondition);
1902 }
1903 
1904 U_NAMESPACE_END
1905 
1906 #endif /* U_SHOW_CPLUSPLUS_API */
1907 
1908 #endif
#define INITIAL_CAPACITY
The initial size of an array if it is unspecified.
Definition: RunArrays.h:32
bool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:346
virtual UMatchDegree matches(const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental) override
Implement UnicodeMatcher API.
static UClassID getStaticClassID()
ICU "poor man's RTTI", returns a UClassID for this class.
U_HEADER_NESTED_NAMESPACE::USetElementIterator end() const
Definition: uniset.h:1215
UMatchDegree
Constants returned by UnicodeMatcher::matches() indicating the degree of match.
Definition: unimatch.h:33
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
C API: This file defines an abstract map from Unicode code points to integer values.
U_HEADER_NESTED_NAMESPACE::USetStrings strings() const
Returns a C++ "range" for iterating over the empty and multi-character strings of this set...
Definition: uniset.h:1173
virtual UBool matchesIndexValue(uint8_t v) const =0
Returns true if this matcher will match a character c, where c & 0xFF == v, at offset, in the forward direction (with limit > offset).
C API: Unicode Set.
An interface that defines both lookup protocol and parsing of symbolic names.
Definition: symtable.h:59
virtual UnicodeString & toPattern(UnicodeString &result, UBool escapeUnprintable=false) const =0
Returns a string representation of this matcher.
Replaceable is an abstract base class representing a string of characters that supports the replaceme...
Definition: rep.h:77
U_HEADER_NESTED_NAMESPACE::USetCodePoints codePoints() const
Returns a C++ "range" for iterating over the code points of this set.
Definition: uniset.h:1123
UnicodeFilter defines a protocol for selecting a subset of the full range (U+0000 to U+10FFFF) of Uni...
Definition: unifilt.h:65
virtual void addMatchSetTo(UnicodeSet &toUnionTo) const =0
Union the set of all characters that may be matched by this object into the given set...
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:427
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:285
virtual UClassID getDynamicClassID() const override=0
Returns a unique class ID polymorphically.
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:186
UProperty
Selection constants for Unicode properties.
Definition: uchar.h:196
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:430
struct USet USet
USet is the C API type corresponding to C++ class UnicodeSet.
Definition: uset.h:54
ParsePosition is a simple class used by Format and its subclasses to keep track of the current positi...
Definition: parsepos.h:52
int32_t length() const
Return the length of the UnicodeString object.
Definition: unistr.h:4214
char16_t * getBuffer(int32_t minCapacity)
Get a read/write pointer to the internal buffer.
virtual UnicodeFilter * clone() const override=0
Clones this object polymorphically.
Basic definitions for ICU, for both C and C++ APIs.
virtual UBool contains(UChar32 c) const =0
Returns true for characters that are in the selected subset.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:315
U_HEADER_NESTED_NAMESPACE::USetRanges ranges() const
Returns a C++ "range" for iterating over the code point ranges of this set.
Definition: uniset.h:1149
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:295
C++ API: Unicode Filter.
U_HEADER_NESTED_NAMESPACE::USetElementIterator begin() const
Returns a C++ iterator for iterating over all of the elements of this set.
Definition: uniset.h:1203
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:247