// boyer.cpp : implementation of the boyer class to do Boyer Moore searches
//
// Copyright (c) 1999 by Andrew W. Phillips.
//
// No restrictions are placed on the noncommercial use of this code,
// as long as this text (from the above copyright notice to the
// disclaimer below) is preserved.
//
// This code may be redistributed as long as it remains unmodified
// and is not sold for profit without the author's written consent.
//
// This code, or any part of it, may not be used in any software that
// is sold for profit, without the author's written consent.
//
// DISCLAIMER: This file is provided "as is" with no expressed or
// implied warranty. The author accepts no liability for any damage
// or loss of business that this product may cause.
//

#include "stdafx.h"
#include <string.h>

#include "boyer.h"

#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif

// Converts an EBCDIC char to upper case
unsigned char e2u_tab[] =
{
        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
        0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
        0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
        0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
        0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
        0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
        0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,

        0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,
        0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
        0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,
        0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
        0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
        0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
        0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
        0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,

        0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
        0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
        0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
        0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
        0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
        0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
        0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,
        0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,

        0xc0,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
        0x88,0x89,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
        0xd0,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
        0x98,0x99,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
        0xe0,0xe1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
        0xa8,0xa9,0xea,0xeb,0xec,0xed,0xee,0xef,
        0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,
        0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff,
};

unsigned char e2l_tab[] =
{
        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
        0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
        0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
        0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
        0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
        0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
        0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,

        0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,
        0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
        0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,
        0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
        0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
        0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
        0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
        0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,

        0x80,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,
        0xc8,0xc9,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
        0x90,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,
        0xd8,0xd9,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
        0xa0,0xa1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,
        0xe8,0xe9,0xaa,0xab,0xac,0xad,0xae,0xaf,
        0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,
        0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,

        0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,
        0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
        0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,
        0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
        0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,
        0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
        0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,
        0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff,
};

// Normal constructor
boyer::boyer(const unsigned char *pat, size_t patlen)
{
    size_t ii;

    // Keep a copy of the pattern
    pattern_ = new unsigned char[patlen];
    memcpy(pattern_, pat, patlen);

    for (ii = 0; ii < 256; ++ii)
        fskip_[ii] = bskip_[ii] = patlen;
    for (ii = 0; ii < patlen; ++ii)
        fskip_[pat[ii]] = patlen - (ii + 1);
    for (ii = patlen; ii > 0; ii--)
        bskip_[pat[ii-1]] = ii - 1;
    pattern_len_ = patlen;
}

// Copy constructor
boyer::boyer(const boyer &from)
{
    pattern_len_ = from.pattern_len_;
    pattern_ = new unsigned char[pattern_len_];
    memcpy(pattern_, from.pattern_, pattern_len_);
    memcpy(fskip_, from.fskip_, sizeof(fskip_));
    memcpy(bskip_, from.bskip_, sizeof(bskip_));
}

// Copy assignment operator
boyer &boyer::operator=(const boyer &from)
{
    if (&from != this)
    {
        pattern_len_ = from.pattern_len_;
        pattern_ = new unsigned char[pattern_len_];
        memcpy(pattern_, from.pattern_, pattern_len_);
        memcpy(fskip_, from.fskip_, sizeof(fskip_));
        memcpy(bskip_, from.bskip_, sizeof(bskip_));
    }
    return *this;
}

boyer::~boyer()
{
    delete[] pattern_;
}

unsigned char *boyer::find(unsigned char *pp, size_t len, BOOL icase, int tt) const
{
    size_t spos = pattern_len_ - 1;     // Posn within searched bytes
    size_t patpos = pattern_len_ - 1;   // Posn within search pattern

    // Leave icase/ebcdic tests outside the loop for speed
    if (tt == 3 && icase)
    {
        while (spos < len)
        {
            if (e2u_tab[pp[spos]] != e2u_tab[pattern_[patpos]])
            {
                // Match failed so skip forward to start another
                spos += max(pattern_len_ - patpos, 
                    min(fskip_[e2u_tab[pp[spos]]], fskip_[e2l_tab[pp[spos]]]));
                patpos = pattern_len_ - 1;
            }
            else
            {
                // This characters matches - is that all of them?
                if (patpos == 0)
                    return pp + spos;

                // Go backwards one to check it next
                spos--;
                patpos--;
            }
        }
    }
    else if (icase)
    {
        // Case insensitive search (ASCII or Unicode).
        // Note: This only works for the ASCII subset of Unicode
        // (where the high byte is zero and low byte == ASCII value).
        while (spos < len)
        {
            if (toupper(pp[spos]) != toupper(pattern_[patpos]))
            {
                // Match failed so skip forward to start another
                spos += max(pattern_len_ - patpos, 
                    min(fskip_[toupper(pp[spos])], fskip_[tolower(pp[spos])]));
                patpos = pattern_len_ - 1;
            }
            else
            {
                // This characters matches - is that all of them?
                if (patpos == 0)
                    return pp + spos;

                // Go backwards one for to check it next
                spos--;
                patpos--;
            }
        }
    }
    else
    {
        // While we haven't reached the end of the buffer to search
        while (spos < len)
        {
            if (pp[spos] != pattern_[patpos])
            {
                // Match failed so skip forward to start another
                spos += max(pattern_len_ - patpos, fskip_[pp[spos]]);
                patpos = pattern_len_ - 1;
            }
            else
            {
                // This characters matches - is that all of them?
                if (patpos == 0)
                    return pp + spos;

                // Go backwards one to check it next
                spos--;
                patpos--;
            }
        }
    }
    return NULL;                // Pattern not found
}

unsigned char *boyer::findback(unsigned char *pp, size_t len, BOOL icase, int tt) const
{
    long spos = len - pattern_len_;     // Current position within search bytes
    size_t patpos = 0;                  // Current position within pattern

    // Leave icase/ebcdic tests outside the loop for speed
    if (tt == 3 && icase)
    {
        // Case insensitive EBCDIC search
        while (spos >= 0)
        {
            if (e2u_tab[pp[spos]] != e2u_tab[pattern_[patpos]])
            {
                // Match failed so skip back to start another comparison
                spos -= max(patpos + 1, 
                    min(bskip_[e2u_tab[pp[spos]]], bskip_[e2l_tab[pp[spos]]]));
                patpos = 0;
            }
            else
            {
                // One more byte macthes - return if all of pattern is matched
                if (++patpos == pattern_len_)
                    return pp + spos - pattern_len_ + 1;
                ++spos;
            }
        }
    }
    else if (icase)
    {
        // Case insensitive search (ASCII or Unicode).
        // Note: This only works for the ASCII subset of Unicode
        // (where the high byte is zero and low byte == ASCII value).
        while (spos >= 0)
        {
            if (toupper(pp[spos]) != toupper(pattern_[patpos]))
            {
                // Match failed so skip back to start another comparison
                spos -= max(patpos + 1, 
                    min(bskip_[toupper(pp[spos])], bskip_[tolower(pp[spos])]));
                patpos = 0;
            }
            else
            {
                // One more byte macthes - return if all of pattern is matched
                if (++patpos == pattern_len_)
                    return pp + spos - pattern_len_ + 1;
                ++spos;
            }
        }
    }
    else
    {
        // While we haven't reached the start of the buffer to be searched
        while (spos >= 0)
        {
            if (pp[spos] != pattern_[patpos])
            {
                // Match failed so skip back to start another comparison
                spos -= max(patpos + 1, bskip_[pp[spos]]);
                patpos = 0;
            }
            else
            {
                // One more byte macthes - return if all of pattern is matched
                if (++patpos == pattern_len_)
                    return pp + spos - pattern_len_ + 1;
                ++spos;
            }
        }
    }
    return NULL;                // Pattern not matched
}
