00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017 #ifndef __BYTESTRIE_H__
00018 #define __BYTESTRIE_H__
00019
00025 #include "unicode/utypes.h"
00026 #include "unicode/stringpiece.h"
00027 #include "unicode/uobject.h"
00028 #include "unicode/ustringtrie.h"
00029
00030 U_NAMESPACE_BEGIN
00031
00032 class ByteSink;
00033 class BytesTrieBuilder;
00034 class CharString;
00035 class UVector32;
00036
00050 class U_COMMON_API BytesTrie : public UMemory {
00051 public:
00066 BytesTrie(const void *trieBytes)
00067 : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
00068 pos_(bytes_), remainingMatchLength_(-1) {}
00069
00074 ~BytesTrie();
00075
00082 BytesTrie(const BytesTrie &other)
00083 : ownedArray_(NULL), bytes_(other.bytes_),
00084 pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
00085
00091 BytesTrie &reset() {
00092 pos_=bytes_;
00093 remainingMatchLength_=-1;
00094 return *this;
00095 }
00096
00102 class State : public UMemory {
00103 public:
00108 State() { bytes=NULL; }
00109 private:
00110 friend class BytesTrie;
00111
00112 const uint8_t *bytes;
00113 const uint8_t *pos;
00114 int32_t remainingMatchLength;
00115 };
00116
00124 const BytesTrie &saveState(State &state) const {
00125 state.bytes=bytes_;
00126 state.pos=pos_;
00127 state.remainingMatchLength=remainingMatchLength_;
00128 return *this;
00129 }
00130
00141 BytesTrie &resetToState(const State &state) {
00142 if(bytes_==state.bytes && bytes_!=NULL) {
00143 pos_=state.pos;
00144 remainingMatchLength_=state.remainingMatchLength;
00145 }
00146 return *this;
00147 }
00148
00155 UStringTrieResult current() const;
00156
00165 inline UStringTrieResult first(int32_t inByte) {
00166 remainingMatchLength_=-1;
00167 if(inByte<0) {
00168 inByte+=0x100;
00169 }
00170 return nextImpl(bytes_, inByte);
00171 }
00172
00180 UStringTrieResult next(int32_t inByte);
00181
00197 UStringTrieResult next(const char *s, int32_t length);
00198
00208 inline int32_t getValue() const {
00209 const uint8_t *pos=pos_;
00210 int32_t leadByte=*pos++;
00211
00212 return readValue(pos, leadByte>>1);
00213 }
00214
00224 inline UBool hasUniqueValue(int32_t &uniqueValue) const {
00225 const uint8_t *pos=pos_;
00226
00227 return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
00228 }
00229
00238 int32_t getNextBytes(ByteSink &out) const;
00239
00244 class U_COMMON_API Iterator : public UMemory {
00245 public:
00257 Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
00258
00270 Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
00271
00276 ~Iterator();
00277
00283 Iterator &reset();
00284
00289 UBool hasNext() const;
00290
00305 UBool next(UErrorCode &errorCode);
00306
00311 StringPiece getString() const;
00316 int32_t getValue() const { return value_; }
00317
00318 private:
00319 UBool truncateAndStop();
00320
00321 const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
00322
00323 const uint8_t *bytes_;
00324 const uint8_t *pos_;
00325 const uint8_t *initialPos_;
00326 int32_t remainingMatchLength_;
00327 int32_t initialRemainingMatchLength_;
00328
00329 CharString *str_;
00330 int32_t maxLength_;
00331 int32_t value_;
00332
00333
00334
00335
00336
00337
00338
00339
00340 UVector32 *stack_;
00341 };
00342
00343 private:
00344 friend class BytesTrieBuilder;
00345
00352 BytesTrie(void *adoptBytes, const void *trieBytes)
00353 : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
00354 bytes_(static_cast<const uint8_t *>(trieBytes)),
00355 pos_(bytes_), remainingMatchLength_(-1) {}
00356
00357
00358 BytesTrie &operator=(const BytesTrie &other);
00359
00360 inline void stop() {
00361 pos_=NULL;
00362 }
00363
00364
00365
00366 static int32_t readValue(const uint8_t *pos, int32_t leadByte);
00367 static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
00368
00369 if(leadByte>=(kMinTwoByteValueLead<<1)) {
00370 if(leadByte<(kMinThreeByteValueLead<<1)) {
00371 ++pos;
00372 } else if(leadByte<(kFourByteValueLead<<1)) {
00373 pos+=2;
00374 } else {
00375 pos+=3+((leadByte>>1)&1);
00376 }
00377 }
00378 return pos;
00379 }
00380 static inline const uint8_t *skipValue(const uint8_t *pos) {
00381 int32_t leadByte=*pos++;
00382 return skipValue(pos, leadByte);
00383 }
00384
00385
00386 static const uint8_t *jumpByDelta(const uint8_t *pos);
00387
00388 static inline const uint8_t *skipDelta(const uint8_t *pos) {
00389 int32_t delta=*pos++;
00390 if(delta>=kMinTwoByteDeltaLead) {
00391 if(delta<kMinThreeByteDeltaLead) {
00392 ++pos;
00393 } else if(delta<kFourByteDeltaLead) {
00394 pos+=2;
00395 } else {
00396 pos+=3+(delta&1);
00397 }
00398 }
00399 return pos;
00400 }
00401
00402 static inline UStringTrieResult valueResult(int32_t node) {
00403 return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
00404 }
00405
00406
00407 UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
00408
00409
00410 UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
00411
00412
00413
00414
00415 static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
00416 UBool haveUniqueValue, int32_t &uniqueValue);
00417
00418
00419 static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
00420
00421
00422
00423 static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
00424 static void append(ByteSink &out, int c);
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465 static const int32_t kMaxBranchLinearSubNodeLength=5;
00466
00467
00468 static const int32_t kMinLinearMatch=0x10;
00469 static const int32_t kMaxLinearMatchLength=0x10;
00470
00471
00472
00473
00474
00475
00476 static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;
00477
00478 static const int32_t kValueIsFinal=1;
00479
00480
00481 static const int32_t kMinOneByteValueLead=kMinValueLead/2;
00482 static const int32_t kMaxOneByteValue=0x40;
00483
00484 static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;
00485 static const int32_t kMaxTwoByteValue=0x1aff;
00486
00487 static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;
00488 static const int32_t kFourByteValueLead=0x7e;
00489
00490
00491 static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
00492
00493 static const int32_t kFiveByteValueLead=0x7f;
00494
00495
00496 static const int32_t kMaxOneByteDelta=0xbf;
00497 static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1;
00498 static const int32_t kMinThreeByteDeltaLead=0xf0;
00499 static const int32_t kFourByteDeltaLead=0xfe;
00500 static const int32_t kFiveByteDeltaLead=0xff;
00501
00502 static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;
00503 static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;
00504
00505 uint8_t *ownedArray_;
00506
00507
00508 const uint8_t *bytes_;
00509
00510
00511
00512
00513 const uint8_t *pos_;
00514
00515 int32_t remainingMatchLength_;
00516 };
00517
00518 U_NAMESPACE_END
00519
00520 #endif // __BYTESTRIE_H__