123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213 |
- /*
- Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation
- by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
- */
- var EventEmitter = require('events').EventEmitter,
- inherits = require('util').inherits;
/**
 * Element-wise comparison of two indexable sequences (memcmp-like, but only
 * reporting equality).
 *
 * @param {Buffer|Array} buf1 - First sequence.
 * @param {number} pos1 - Index in `buf1` where the comparison starts.
 * @param {Buffer|Array} buf2 - Second sequence.
 * @param {number} pos2 - Index in `buf2` where the comparison starts.
 * @param {number} num - Number of elements to compare.
 * @returns {boolean} `true` when all `num` element pairs are strictly equal
 *   (trivially `true` when `num` is zero or negative), `false` otherwise.
 */
function jsmemcmp(buf1, pos1, buf2, pos2, num) {
  var offset = 0;
  while (offset < num) {
    if (buf1[pos1 + offset] !== buf2[pos2 + offset]) {
      return false;
    }
    offset += 1;
  }
  return true;
}
/**
 * Streaming Boyer-Moore-Horspool searcher.
 *
 * Data is supplied chunk by chunk through `push()`. Results are reported
 * through 'info' events with the signature `(isMatch, data, start, end)`:
 *   - `isMatch` is true when the needle was found directly after the
 *     reported bytes, false when the bytes are plain non-matching data.
 *   - `data`, `start`, `end` describe the bytes `data[start..end)`; they are
 *     omitted when a match has no pending bytes in front of it.
 * The `data` buffer may be the internal look-behind buffer, which is reused
 * between calls, so listeners should consume it synchronously.
 *
 * @param {string|Buffer} needle - Byte sequence to search for. Strings are
 *   converted with Buffer.from() (UTF-8).
 */
function SBMH(needle) {
  EventEmitter.call(this);

  if (typeof needle === 'string')
    needle = Buffer.from(needle);

  var i, needle_len = needle.length;

  // Public knobs/counters: push() stops searching once `matches` reaches
  // `maxMatches`.
  this.maxMatches = Infinity;
  this.matches = 0;

  this._occ = new Array(256);
  // Number of valid bytes currently stored in `_lookbehind`.
  this._lookbehind_size = 0;
  this._needle = needle;
  // Offset in the current chunk from which unreported data starts.
  this._bufpos = 0;
  // Holds a trailing partial match carried over from previous chunks.
  this._lookbehind = Buffer.alloc(needle_len);

  // Initialize occurrence table: by default skip a whole needle length.
  for (i = 0; i < 256; ++i)
    this._occ[i] = needle_len;

  // Populate occurrence table with analysis of the needle,
  // ignoring last letter.
  for (i = 0; i < needle_len - 1; ++i)
    this._occ[needle[i]] = needle_len - 1 - i;
}
inherits(SBMH, EventEmitter);

/**
 * Forget any partial match and reset the match counter so the instance can
 * be used for a new stream. `maxMatches` is left untouched.
 */
SBMH.prototype.reset = function() {
  this._lookbehind_size = 0;
  this.matches = 0;
  this._bufpos = 0;
};

/**
 * Feed the next chunk of the stream to the searcher.
 *
 * @param {Buffer|string} chunk - Next piece of data. Non-Buffer values are
 *   converted with Buffer.from(chunk, 'binary').
 * @param {number} [pos=0] - Offset in `chunk` at which to start.
 * @returns {number|undefined} The offset in `chunk` up to which data has been
 *   processed: `chunk.length`, or the offset just past the last match when
 *   `maxMatches` was reached. `undefined` when `maxMatches` had already been
 *   reached before the call.
 */
SBMH.prototype.push = function(chunk, pos) {
  var r, chlen;
  if (!Buffer.isBuffer(chunk))
    chunk = Buffer.from(chunk, 'binary');
  chlen = chunk.length;
  this._bufpos = pos || 0;
  // Each _sbmh_feed() call handles at most one match, so keep calling until
  // the chunk is exhausted or the match limit is hit.
  while (r !== chlen && this.matches < this.maxMatches)
    r = this._sbmh_feed(chunk);
  return r;
};

/**
 * Process `data` up to and including the next match (or up to its end when
 * there is no further match), emitting 'info' events along the way.
 *
 * @param {Buffer} data - The chunk currently being processed.
 * @returns {number} Offset in `data` just past the match, or `data.length`
 *   when the rest of the chunk was consumed without a match.
 */
SBMH.prototype._sbmh_feed = function(data) {
  var len = data.length,
      needle = this._needle,
      needle_len = needle.length,
      last_needle_char = needle[needle_len - 1],
      occ = this._occ,
      lookbehind = this._lookbehind,
      ch,
      preceding,
      bytesToCutOff;

  // Positive: points to a position in `data`
  //           pos == 3 points to data[3]
  // Negative: points to a position in the lookbehind buffer
  //           pos == -2 points to lookbehind[lookbehind_size - 2]
  var pos = -this._lookbehind_size;

  if (pos < 0) {
    // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool
    // search with character lookup code that considers both the
    // lookbehind buffer and the current round's haystack data.
    //
    // Loop until
    //   there is a match.
    // or until
    //   we've moved past the position that requires the
    //   lookbehind buffer. In this case we switch to the
    //   optimized loop.
    // or until
    //   the character to look at lies outside the haystack.
    while (pos < 0 && pos <= len - needle_len) {
      ch = this._sbmh_lookup_char(data, pos + needle_len - 1);

      if (ch === last_needle_char
          && this._sbmh_memcmp(data, pos, needle_len - 1)) {
        // Lookbehind bytes in front of the match start are ordinary data.
        // Their count must be computed BEFORE the lookbehind size is
        // cleared, otherwise they would be silently dropped.
        preceding = this._lookbehind_size + pos;
        this._lookbehind_size = 0;
        ++this.matches;
        if (preceding > 0)
          this.emit('info', true, lookbehind, 0, preceding);
        else
          this.emit('info', true);

        this._bufpos = pos + needle_len;
        return pos + needle_len;
      }
      pos += occ[ch];
    }

    // No match.

    if (pos < 0) {
      // There's too few data for Boyer-Moore-Horspool to run,
      // so let's use a different algorithm to skip as much as
      // we can.
      // Forward pos until
      //   the trailing part of lookbehind + data
      //   looks like the beginning of the needle
      // or until
      //   pos == 0
      while (pos < 0 && !this._sbmh_memcmp(data, pos, len - pos))
        pos++;
    }

    if (pos >= 0) {
      // Discard lookbehind buffer.
      this.emit('info', false, lookbehind, 0, this._lookbehind_size);
      this._lookbehind_size = 0;
    } else {
      // Cut off part of the lookbehind buffer that has
      // been processed and append the entire haystack
      // into it.
      bytesToCutOff = this._lookbehind_size + pos;

      if (bytesToCutOff > 0) {
        // The cut off data is guaranteed not to contain the needle.
        this.emit('info', false, lookbehind, 0, bytesToCutOff);
      }

      // Shift the surviving bytes to the front. Buffer#copy() takes a
      // source END index (not a length), so the end is the old size.
      lookbehind.copy(lookbehind, 0, bytesToCutOff, this._lookbehind_size);
      this._lookbehind_size -= bytesToCutOff;

      data.copy(lookbehind, this._lookbehind_size);
      this._lookbehind_size += len;

      this._bufpos = len;
      return len;
    }
  }

  // Continue from where the previous match (or the caller supplied start
  // offset) left off in this chunk.
  if (pos >= 0)
    pos += this._bufpos;

  // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool
  // search with optimized character lookup code that only considers
  // the current round's haystack data.
  while (pos <= len - needle_len) {
    ch = data[pos + needle_len - 1];

    if (ch === last_needle_char
        && data[pos] === needle[0]
        && jsmemcmp(needle, 0, data, pos, needle_len - 1)) {
      ++this.matches;
      if (pos > 0)
        this.emit('info', true, data, this._bufpos, pos);
      else
        this.emit('info', true);

      this._bufpos = pos + needle_len;
      return pos + needle_len;
    }
    pos += occ[ch];
  }

  // There was no match. If there's trailing haystack data that we cannot
  // match yet using the Boyer-Moore-Horspool algorithm (because the trailing
  // data is less than the needle size) then match using a modified
  // algorithm that starts matching from the beginning instead of the end.
  // Whatever trailing data is left after running this algorithm is added to
  // the lookbehind buffer.
  if (pos < len) {
    while (pos < len && (data[pos] !== needle[0]
                         || !jsmemcmp(data, pos, needle, 0, len - pos))) {
      ++pos;
    }
    if (pos < len) {
      data.copy(lookbehind, 0, pos, len);
      this._lookbehind_size = len - pos;
    }
  }

  // Everything until pos is guaranteed not to contain needle data.
  if (pos > 0)
    this.emit('info', false, data, this._bufpos, pos < len ? pos : len);

  this._bufpos = len;
  return len;
};

/**
 * Read one byte from the virtual concatenation of lookbehind + `data`.
 *
 * @param {Buffer} data - The chunk currently being processed.
 * @param {number} pos - Position; negative values index the lookbehind
 *   buffer counted from its end, non-negative values index `data`.
 * @returns {number|undefined} The byte at that position.
 */
SBMH.prototype._sbmh_lookup_char = function(data, pos) {
  if (pos < 0)
    return this._lookbehind[this._lookbehind_size + pos];
  return data[pos];
};

/**
 * Compare the first `len` bytes of the needle with the bytes starting at
 * `pos` in the virtual concatenation of lookbehind + `data`.
 *
 * @param {Buffer} data - The chunk currently being processed.
 * @param {number} pos - Start position (may be negative, see
 *   `_sbmh_lookup_char`).
 * @param {number} len - Number of bytes to compare.
 * @returns {boolean} `true` when all compared bytes are equal.
 */
SBMH.prototype._sbmh_memcmp = function(data, pos, len) {
  for (var i = 0; i < len; ++i) {
    if (this._sbmh_lookup_char(data, pos + i) !== this._needle[i])
      return false;
  }
  return true;
};
- module.exports = SBMH;
|