Boyer-Moore algorithm C++ implementation

2013-07-25

Here is the Boyer-Moore algorithm C++ implementation.

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <string.h>
#include <algorithm>
#define ALPHABET_LENGTH 256
#define MAX(x, y) (x) > (y) ? (x) : (y)

using namespace std;

vector <int> L1;
vector <int> L2;

typedef struct ListNode
{
    struct ListNode *Next;
    int Index;
} ListNode;

typedef struct List
{
    struct ListNode *curr, *start;
} List;

List *ListCreate(void)
{
    List *list = new List();
    list->start = list->curr = NULL;
    return list;
}

void ListAddFirst(List *list, int index)
{
    ListNode *node = new ListNode();
    node->Index = index;
    node->Next = list->start;
    list->curr = list->start = node;
}

ListNode *ListMovePtr(List *list)
{
    if (list->curr != NULL)
        list->curr = list->curr->Next;
    return list->curr;
}

void ListReset(List *list)
{
    list->curr = list->start;
}

void ExitOnError(const char *msg)
{
    printf("Error: %s\r\n", msg);
    exit(0);
}

// count z-function for linear time O(n)
vector <int> z_function(string s)
{
    int n = (int)s.length();
    vector <int> z(n);
    for (int i = 1, l = 0, r = 0; i < n; i++)
    {
        if (i <= r)
            z[i] = min(r - i + 1, z[i-l]);
        while (i + z[i] < n && s[z[i]] == s[i + z[i]])
            z[i]++;
        if (i + z[i] - 1 > r)
            l = i, r = i + z[i] - 1;
    }
    return z;
}

vector <int> n_function(string s)
{
    reverse(s.begin(), s.end());
    vector <int> res = z_function(s);
    reverse(res.begin(), res.end());
    return res;
}

// calculate max suffix of P[1..i] equal to some suffix of P
void CalculateLFuncs(vector <int> n_fun)
{
    int len = (int)n_fun.size();
    L1.resize(len);
    L2.resize(len);
    for (int i = 0; i < len; i++)
        if (n_fun[i])
            L1[len - n_fun[i]] = i + 1;
    L2[len - 1] = n_fun[0];

    for (int i = 0; i < len; i++)
    {
        int q = n_fun[i] == i + 1 ? n_fun[i] : 0;
        L2[len - i - 1] = MAX(q, L2[len - i]);
    }
}

// build bad symbol shift table
void BuildBadSymShiftTable(string s, List **list)
{
    for (int i = 0; i < (int)s.size(); i++)
        ListAddFirst(list[s[i]], i);

}

int BoyerMoore(string text, string pattern, List **entries)
{
    int s_len = (int)text.size();
    int p_len = (int)pattern.size();
    int k = p_len - 1;
    int last_entry = -1;
    int not_occured = 1;
    while (k < s_len && not_occured)
    {
        int i = p_len - 1;
        int h = k;
        for (; i >= 0 && pattern[i] == text[h]; i--, h--)
            ListMovePtr(entries[pattern[i]]);
        if (i < 0)
        {
            last_entry = h + 2;
            k += p_len - L2[1];
            not_occured = 0;
        }
        else
        {
            int stop_symbol_rule = !entries]->curr ? i + 1 : i - entries]->curr->Index;
            int good_suff_rule = i == p_len - 1 ? 1 : L1[i + 1] == 0 ? p_len - L2[i + 1] - 1: p_len - L1[i + 1];
            k += MAX(stop_symbol_rule, good_suff_rule);
        }
        for (i = 0; i < ALPHABET_LENGTH; i++)
            ListReset(entries[i]);
    }
    return last_entry;
}

int main(void)
{
    FILE *fin = freopen("input.txt", "r", stdin);
    FILE *fout = freopen("output.txt", "w", stdout);
    if (!fin)
        ExitOnError("cannot open input file");
    if (!fout)
        ExitOnError("cannot open output file");
    string text, pattern;
    getline(cin, text);
    getline(cin, pattern);

    List *entries[ALPHABET_LENGTH];
    for (int i = 0; i < ALPHABET_LENGTH; i++)
        entries[i] = ListCreate();

    BuildBadSymShiftTable(pattern, entries);
    CalculateLFuncs(n_function(pattern));
    cout << BoyerMoore(text, pattern, entries);
    fclose(fin);
    fclose(fout);
}