RangeCoder.hpp 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. /*
  2. Copyright (c) 2009-2010 Christopher A. Taylor. All rights reserved.
  3. Redistribution and use in source and binary forms, with or without
  4. modification, are permitted provided that the following conditions are met:
  5. * Redistributions of source code must retain the above copyright notice,
  6. this list of conditions and the following disclaimer.
  7. * Redistributions in binary form must reproduce the above copyright notice,
  8. this list of conditions and the following disclaimer in the documentation
  9. and/or other materials provided with the distribution.
  10. * Neither the name of LibCat nor the names of its contributors may be used
  11. to endorse or promote products derived from this software without
  12. specific prior written permission.
  13. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  14. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  16. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  17. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  18. CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  19. SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  20. INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  21. CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  22. ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  23. POSSIBILITY OF SUCH DAMAGE.
  24. */
  25. #ifndef CAT_RANGE_CODER_HPP
  26. #define CAT_RANGE_CODER_HPP
  27. #include <cat/Platform.hpp>
  28. #include <ostream>
  29. #include <map>
  30. #include <string>
  31. namespace cat {
  32. /*
  33. TextStatsCollector
  34. Collects order-1 statistics of the text given one character at a time.
  35. Order-1 statistics include the likelihood of a character given the previous one.
  36. This is intended to be used on a large sample of text (of unlimited length)
  37. to come up with statistics that most text should follow. When the resulting
  38. table is used with a range coder, compression should be reliably achieved,
  39. though a bit should be allocated for the case where the result of the coder
  40. would be longer than encoding with uniform ranges, like if someone enters "ZZZZ".
  41. The RangeEncoder class has a text compressor based on the output of this class.
  42. I opted for a static table because this is to be run on a network server
  43. where many clients are connected. The memory needed for this type of
  44. compression does not increase with the number of clients.
  45. */
  46. class TextStatsCollector
  47. {
  48. u32 last, total, frequencies[256][256];
  49. u8 seen[256];
  50. public:
  51. TextStatsCollector();
  52. public:
  53. // Record a character occurance
  54. // 0 = end of line, so next character counts towards initial character frequency
  55. void Tally(u8 x);
  56. #if defined(CAT_PRAGMA_PACK)
  57. #pragma pack(push)
  58. #pragma pack(1)
  59. #endif
  60. struct TableFormat
  61. {
  62. // MurmurHash2 of remainder, with seed = 0
  63. u32 hash;
  64. // Total symbols in the table <= 256
  65. u16 total;
  66. // Fraction of a byte represented by total, scaled from [0, 2^15]
  67. u16 log2total;
  68. // ASCII character code -> Table index map
  69. u8 char2index[256];
  70. // Table index -> ASCII character code map
  71. u8 index2char[256];
  72. /*
  73. Start of frequency table
  74. The first 32 entries are used for reverse lookup (freq->symbol):
  75. GET_SYMBOL_LUT() will get this address:
  76. frequencies[0..15] = array of 16 bytes creating a lookup table (LUT) given
  77. the high 4 bits of the frequency, for the low range
  78. frequencies[16..31] = array of 16 bytes creating a lookup table (LUT) given
  79. the high 4 bits of the frequency, for the high range
  80. GET_SYMBOL_BASE() will get this address:
  81. frequencies[32] = cumulative frequency for (last=0, this=1) out of 2^16 trials
  82. frequencies[33] = cumulative frequency for (last=0, this=2) out of 2^16 trials
  83. frequencies[34] = ...
  84. Note: (0, 0) is implicitly zero, and (0, TOTAL) is implicitly 2^16
  85. So these tables don't include those implicit entries.
  86. */
  87. u16 frequencies[1];
  88. } CAT_PACKED;
  89. #if defined(CAT_PRAGMA_PACK)
  90. #pragma pack(pop)
  91. #endif
  92. // Returns code that creates a table in the above format
  93. bool GenerateMinimalStaticTable(const char *TableName, std::ostream &osout);
  94. // Check for errors in the in-memory version of the table that was generated
  95. static bool VerifyTableIntegrity(const TableFormat *table);
  96. };
  97. /*
  98. Range Encoder
  99. Encodes a single message one field at a time using the minimum
  100. number of bits, rounded up to the next highest byte.
  101. To insure that the message does not grow in size, provide limited
  102. space for the output buffer and check .Fail() at the end. If it
  103. failed, this means it ran out of space during encoding.
  104. If encoding succeeded, check .Used() to determine the number of
  105. bytes used by the output buffer.
  106. */
  107. class RangeEncoder
  108. {
  109. u8 *output;
  110. int limit, remaining;
  111. u64 low, range;
  112. // Write out bytes as needed
  113. CAT_INLINE void Normalize();
  114. CAT_INLINE void GetTableSymbol(const TextStatsCollector::TableFormat *stats, u32 &last, u8 ch, u16 &symbol_low, u16 &symbol_range);
  115. public:
  116. // Ctors
  117. RangeEncoder(void *output_i, int limit_i);
  118. RangeEncoder(RangeEncoder &cp);
  119. // Overwrite one context with another. Using context references instead
  120. // of working on the contexts directly is probably more efficient.
  121. RangeEncoder &operator=(RangeEncoder &cp);
  122. // State accessors
  123. bool Fail() { return output == 0; }
  124. int Used() { return limit - remaining; }
  125. public:
  126. // Encode the given text with the given statistics
  127. // NOTE: May be up to one byte longer than the original message
  128. void Text(const char *msg, const TextStatsCollector::TableFormat *stats);
  129. // Encode a biased bit given the frequency it is 0
  130. // frequency = average times out of 2^32 trials the bit will be 0
  131. void BiasedBit(u32 b, u32 frequency);
  132. // Encode a field that takes on [0, total) values with equal likelihood
  133. void Range(u32 symbol, u32 total);
  134. // Emit the final byte(s) needed to encode the symbols
  135. void Finish();
  136. };
  137. /*
  138. Range Decoder
  139. Interprets buffers produced by RangeEncoder
  140. */
  141. class RangeDecoder
  142. {
  143. const u8 *input;
  144. int remaining;
  145. u64 code, low, range;
  146. // Read in bytes as needed
  147. CAT_INLINE void Normalize();
  148. // Grab symbol low frequency and range from the table
  149. CAT_INLINE u8 GetTableSymbol(const TextStatsCollector::TableFormat *stats, u32 &last, u16 freq, u16 &symbol_low, u16 &symbol_range);
  150. public:
  151. // Initializing constructor
  152. RangeDecoder(const void *message, int bytes);
  153. int Remaining() { return remaining; }
  154. public:
  155. // Decode the given text with the given statistics
  156. int Text(char *msg, int buffer_size, const TextStatsCollector::TableFormat *stats);
  157. // Decode a biased bit given the frequency it is 0
  158. // frequency = average times out of 2^32 trials the bit will be 0
  159. u32 BiasedBit(u32 frequency);
  160. // Decode a field that takes on [0, total) values with equal likelihood
  161. u32 Range(u32 total);
  162. };
  163. } // namespace cat
  164. #endif // CAT_RANGE_CODER_HPP
粤ICP备19079148号