sbmh.js 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. /*
  2. Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation
  3. by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
  4. */
  5. var EventEmitter = require('events').EventEmitter,
  6. inherits = require('util').inherits;
  7. function jsmemcmp(buf1, pos1, buf2, pos2, num) {
  8. for (var i = 0; i < num; ++i, ++pos1, ++pos2)
  9. if (buf1[pos1] !== buf2[pos2])
  10. return false;
  11. return true;
  12. }
  13. function SBMH(needle) {
  14. if (typeof needle === 'string')
  15. needle = new Buffer(needle);
  16. var i, j, needle_len = needle.length;
  17. this.maxMatches = Infinity;
  18. this.matches = 0;
  19. this._occ = new Array(256);
  20. this._lookbehind_size = 0;
  21. this._needle = needle;
  22. this._bufpos = 0;
  23. this._lookbehind = new Buffer(needle_len);
  24. // Initialize occurrence table.
  25. for (j = 0; j < 256; ++j)
  26. this._occ[j] = needle_len;
  27. // Populate occurrence table with analysis of the needle,
  28. // ignoring last letter.
  29. if (needle_len >= 1) {
  30. for (i = 0; i < needle_len - 1; ++i)
  31. this._occ[needle[i]] = needle_len - 1 - i;
  32. }
  33. }
  34. inherits(SBMH, EventEmitter);
  35. SBMH.prototype.reset = function() {
  36. this._lookbehind_size = 0;
  37. this.matches = 0;
  38. this._bufpos = 0;
  39. };
  40. SBMH.prototype.push = function(chunk, pos) {
  41. var r, chlen;
  42. if (!Buffer.isBuffer(chunk))
  43. chunk = new Buffer(chunk, 'binary');
  44. chlen = chunk.length;
  45. this._bufpos = pos || 0;
  46. while (r !== chlen && this.matches < this.maxMatches)
  47. r = this._sbmh_feed(chunk);
  48. return r;
  49. };
  50. SBMH.prototype._sbmh_feed = function(data) {
  51. var len = data.length, needle = this._needle, needle_len = needle.length;
  52. // Positive: points to a position in `data`
  53. // pos == 3 points to data[3]
  54. // Negative: points to a position in the lookbehind buffer
  55. // pos == -2 points to lookbehind[lookbehind_size - 2]
  56. var pos = -this._lookbehind_size,
  57. last_needle_char = needle[needle_len - 1],
  58. occ = this._occ,
  59. lookbehind = this._lookbehind;
  60. if (pos < 0) {
  61. // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool
  62. // search with character lookup code that considers both the
  63. // lookbehind buffer and the current round's haystack data.
  64. //
  65. // Loop until
  66. // there is a match.
  67. // or until
  68. // we've moved past the position that requires the
  69. // lookbehind buffer. In this case we switch to the
  70. // optimized loop.
  71. // or until
  72. // the character to look at lies outside the haystack.
  73. while (pos < 0 && pos <= len - needle_len) {
  74. var ch = this._sbmh_lookup_char(data, pos + needle_len - 1);
  75. if (ch === last_needle_char
  76. && this._sbmh_memcmp(data, pos, needle_len - 1)) {
  77. this._lookbehind_size = 0;
  78. ++this.matches;
  79. if (pos > -this._lookbehind_size)
  80. this.emit('info', true, lookbehind, 0, this._lookbehind_size + pos);
  81. else
  82. this.emit('info', true);
  83. this._bufpos = pos + needle_len;
  84. return pos + needle_len;
  85. } else
  86. pos += occ[ch];
  87. }
  88. // No match.
  89. if (pos < 0) {
  90. // There's too few data for Boyer-Moore-Horspool to run,
  91. // so let's use a different algorithm to skip as much as
  92. // we can.
  93. // Forward pos until
  94. // the trailing part of lookbehind + data
  95. // looks like the beginning of the needle
  96. // or until
  97. // pos == 0
  98. while (pos < 0 && !this._sbmh_memcmp(data, pos, len - pos))
  99. pos++;
  100. }
  101. if (pos >= 0) {
  102. // Discard lookbehind buffer.
  103. this.emit('info', false, lookbehind, 0, this._lookbehind_size);
  104. this._lookbehind_size = 0;
  105. } else {
  106. // Cut off part of the lookbehind buffer that has
  107. // been processed and append the entire haystack
  108. // into it.
  109. var bytesToCutOff = this._lookbehind_size + pos;
  110. if (bytesToCutOff > 0) {
  111. // The cut off data is guaranteed not to contain the needle.
  112. this.emit('info', false, lookbehind, 0, bytesToCutOff);
  113. }
  114. lookbehind.copy(lookbehind, 0, bytesToCutOff,
  115. this._lookbehind_size - bytesToCutOff);
  116. this._lookbehind_size -= bytesToCutOff;
  117. data.copy(lookbehind, this._lookbehind_size);
  118. this._lookbehind_size += len;
  119. this._bufpos = len;
  120. return len;
  121. }
  122. }
  123. if (pos >= 0)
  124. pos += this._bufpos;
  125. // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool
  126. // search with optimized character lookup code that only considers
  127. // the current round's haystack data.
  128. while (pos <= len - needle_len) {
  129. var ch = data[pos + needle_len - 1];
  130. if (ch === last_needle_char
  131. && data[pos] === needle[0]
  132. && jsmemcmp(needle, 0, data, pos, needle_len - 1)) {
  133. ++this.matches;
  134. if (pos > 0)
  135. this.emit('info', true, data, this._bufpos, pos);
  136. else
  137. this.emit('info', true);
  138. this._bufpos = pos + needle_len;
  139. return pos + needle_len;
  140. } else
  141. pos += occ[ch];
  142. }
  143. // There was no match. If there's trailing haystack data that we cannot
  144. // match yet using the Boyer-Moore-Horspool algorithm (because the trailing
  145. // data is less than the needle size) then match using a modified
  146. // algorithm that starts matching from the beginning instead of the end.
  147. // Whatever trailing data is left after running this algorithm is added to
  148. // the lookbehind buffer.
  149. if (pos < len) {
  150. while (pos < len && (data[pos] !== needle[0]
  151. || !jsmemcmp(data, pos, needle, 0, len - pos))) {
  152. ++pos;
  153. }
  154. if (pos < len) {
  155. data.copy(lookbehind, 0, pos, pos + (len - pos));
  156. this._lookbehind_size = len - pos;
  157. }
  158. }
  159. // Everything until pos is guaranteed not to contain needle data.
  160. if (pos > 0)
  161. this.emit('info', false, data, this._bufpos, pos < len ? pos : len);
  162. this._bufpos = len;
  163. return len;
  164. };
  165. SBMH.prototype._sbmh_lookup_char = function(data, pos) {
  166. if (pos < 0)
  167. return this._lookbehind[this._lookbehind_size + pos];
  168. else
  169. return data[pos];
  170. }
  171. SBMH.prototype._sbmh_memcmp = function(data, pos, len) {
  172. var i = 0;
  173. while (i < len) {
  174. if (this._sbmh_lookup_char(data, pos + i) === this._needle[i])
  175. ++i;
  176. else
  177. return false;
  178. }
  179. return true;
  180. }
  181. module.exports = SBMH;