00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017 #ifndef __BYTESTRIE_H__
00018 #define __BYTESTRIE_H__
00019
00025 #include "unicode/utypes.h"
00026
00027 #if U_SHOW_CPLUSPLUS_API
00028
00029 #include "unicode/stringpiece.h"
00030 #include "unicode/uobject.h"
00031 #include "unicode/ustringtrie.h"
00032
00033 U_NAMESPACE_BEGIN
00034
00035 class ByteSink;
00036 class BytesTrieBuilder;
00037 class CharString;
00038 class UVector32;
00039
00053 class U_COMMON_API BytesTrie : public UMemory {
00054 public:
00069 BytesTrie(const void *trieBytes)
00070 : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
00071 pos_(bytes_), remainingMatchLength_(-1) {}
00072
00077 ~BytesTrie();
00078
00085 BytesTrie(const BytesTrie &other)
00086 : ownedArray_(NULL), bytes_(other.bytes_),
00087 pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
00088
00094 BytesTrie &reset() {
00095 pos_=bytes_;
00096 remainingMatchLength_=-1;
00097 return *this;
00098 }
00099
00100 #ifndef U_HIDE_DRAFT_API
00101
00109 uint64_t getState64() const {
00110 return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) |
00111 (uint64_t)(pos_ - bytes_);
00112 }
00113
00128 BytesTrie &resetToState64(uint64_t state) {
00129 remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2;
00130 pos_ = bytes_ + (state & kState64PosMask);
00131 return *this;
00132 }
00133 #endif
00134
00140 class State : public UMemory {
00141 public:
00146 State() { bytes=NULL; }
00147 private:
00148 friend class BytesTrie;
00149
00150 const uint8_t *bytes;
00151 const uint8_t *pos;
00152 int32_t remainingMatchLength;
00153 };
00154
00162 const BytesTrie &saveState(State &state) const {
00163 state.bytes=bytes_;
00164 state.pos=pos_;
00165 state.remainingMatchLength=remainingMatchLength_;
00166 return *this;
00167 }
00168
00179 BytesTrie &resetToState(const State &state) {
00180 if(bytes_==state.bytes && bytes_!=NULL) {
00181 pos_=state.pos;
00182 remainingMatchLength_=state.remainingMatchLength;
00183 }
00184 return *this;
00185 }
00186
00193 UStringTrieResult current() const;
00194
00203 inline UStringTrieResult first(int32_t inByte) {
00204 remainingMatchLength_=-1;
00205 if(inByte<0) {
00206 inByte+=0x100;
00207 }
00208 return nextImpl(bytes_, inByte);
00209 }
00210
00218 UStringTrieResult next(int32_t inByte);
00219
00235 UStringTrieResult next(const char *s, int32_t length);
00236
00246 inline int32_t getValue() const {
00247 const uint8_t *pos=pos_;
00248 int32_t leadByte=*pos++;
00249
00250 return readValue(pos, leadByte>>1);
00251 }
00252
00262 inline UBool hasUniqueValue(int32_t &uniqueValue) const {
00263 const uint8_t *pos=pos_;
00264
00265 return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
00266 }
00267
00276 int32_t getNextBytes(ByteSink &out) const;
00277
00282 class U_COMMON_API Iterator : public UMemory {
00283 public:
00295 Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
00296
00308 Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
00309
00314 ~Iterator();
00315
00321 Iterator &reset();
00322
00327 UBool hasNext() const;
00328
00343 UBool next(UErrorCode &errorCode);
00344
00349 StringPiece getString() const;
00354 int32_t getValue() const { return value_; }
00355
00356 private:
00357 UBool truncateAndStop();
00358
00359 const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
00360
00361 const uint8_t *bytes_;
00362 const uint8_t *pos_;
00363 const uint8_t *initialPos_;
00364 int32_t remainingMatchLength_;
00365 int32_t initialRemainingMatchLength_;
00366
00367 CharString *str_;
00368 int32_t maxLength_;
00369 int32_t value_;
00370
00371
00372
00373
00374
00375
00376
00377
00378 UVector32 *stack_;
00379 };
00380
00381 private:
00382 friend class BytesTrieBuilder;
00383
00390 BytesTrie(void *adoptBytes, const void *trieBytes)
00391 : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
00392 bytes_(static_cast<const uint8_t *>(trieBytes)),
00393 pos_(bytes_), remainingMatchLength_(-1) {}
00394
00395
00396 BytesTrie &operator=(const BytesTrie &other);
00397
00398 inline void stop() {
00399 pos_=NULL;
00400 }
00401
00402
00403
00404 static int32_t readValue(const uint8_t *pos, int32_t leadByte);
00405 static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
00406
00407 if(leadByte>=(kMinTwoByteValueLead<<1)) {
00408 if(leadByte<(kMinThreeByteValueLead<<1)) {
00409 ++pos;
00410 } else if(leadByte<(kFourByteValueLead<<1)) {
00411 pos+=2;
00412 } else {
00413 pos+=3+((leadByte>>1)&1);
00414 }
00415 }
00416 return pos;
00417 }
00418 static inline const uint8_t *skipValue(const uint8_t *pos) {
00419 int32_t leadByte=*pos++;
00420 return skipValue(pos, leadByte);
00421 }
00422
00423
00424 static const uint8_t *jumpByDelta(const uint8_t *pos);
00425
00426 static inline const uint8_t *skipDelta(const uint8_t *pos) {
00427 int32_t delta=*pos++;
00428 if(delta>=kMinTwoByteDeltaLead) {
00429 if(delta<kMinThreeByteDeltaLead) {
00430 ++pos;
00431 } else if(delta<kFourByteDeltaLead) {
00432 pos+=2;
00433 } else {
00434 pos+=3+(delta&1);
00435 }
00436 }
00437 return pos;
00438 }
00439
00440 static inline UStringTrieResult valueResult(int32_t node) {
00441 return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
00442 }
00443
00444
00445 UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
00446
00447
00448 UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
00449
00450
00451
00452
00453 static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
00454 UBool haveUniqueValue, int32_t &uniqueValue);
00455
00456
00457 static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
00458
00459
00460
00461 static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
00462 static void append(ByteSink &out, int c);
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503 static const int32_t kMaxBranchLinearSubNodeLength=5;
00504
00505
00506 static const int32_t kMinLinearMatch=0x10;
00507 static const int32_t kMaxLinearMatchLength=0x10;
00508
00509
00510
00511
00512
00513
00514 static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;
00515
00516 static const int32_t kValueIsFinal=1;
00517
00518
00519 static const int32_t kMinOneByteValueLead=kMinValueLead/2;
00520 static const int32_t kMaxOneByteValue=0x40;
00521
00522 static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;
00523 static const int32_t kMaxTwoByteValue=0x1aff;
00524
00525 static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;
00526 static const int32_t kFourByteValueLead=0x7e;
00527
00528
00529 static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
00530
00531 static const int32_t kFiveByteValueLead=0x7f;
00532
00533
00534 static const int32_t kMaxOneByteDelta=0xbf;
00535 static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1;
00536 static const int32_t kMinThreeByteDeltaLead=0xf0;
00537 static const int32_t kFourByteDeltaLead=0xfe;
00538 static const int32_t kFiveByteDeltaLead=0xff;
00539
00540 static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;
00541 static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;
00542
00543
00544
00545
00546
00547 static constexpr int32_t kState64RemainingShift = 59;
00548 static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1;
00549
00550 uint8_t *ownedArray_;
00551
00552
00553 const uint8_t *bytes_;
00554
00555
00556
00557
00558 const uint8_t *pos_;
00559
00560 int32_t remainingMatchLength_;
00561 };
00562
00563 U_NAMESPACE_END
00564
00565 #endif
00566
00567 #endif // __BYTESTRIE_H__