Boyer Moore

using System;

namespace Algorithms.Strings
{
    /// <summary>
    ///     The idea:   You compare the pattern with the text from right to left.
    ///     If the text symbol that is compared with the rightmost pattern symbol
    ///     does not occur in the pattern at all, then the pattern can be shifted
    ///     by m positions behind this text symbol.
    ///     Complexity:
    ///     Time:   Preprocessing: O(m²)
    ///     Comparison: O(mn)
    ///     Space:  O(m + a)
    ///     where   m - pattern length
    ///     n - text length
    ///     a - alphabet length.
    ///     Source:     https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/bmen.htm
    ///     https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm.
    /// </summary>
    public static class BoyerMoore
    {
        /// <summary>
        ///     Finds the index of the first occurrence of the pattern <c>p</c> in <c>t</c>.
        /// </summary>
        /// <param name="t">Input text.</param>
        /// <param name="p">Search pattern.</param>
        /// <returns>Index of the pattern in text or -1 if the pattern  was not found.</returns>
        public static int FindFirstOccurrence(string t, string p)
        {
            // Pattern length
            var m = p.Length;

            // Text length
            var n = t.Length;

            // For each symbol of the alphabet, the position of its rightmost occurrence in the pattern,
            // or -1 if the symbol does not occur in the pattern.
            int[] badChar = BadCharacterRule(p, m);

            // Each entry goodSuffix[i] contains the shift distance of the pattern
            // if a mismatch at position i – 1 occurs, i.e. if the suffix of the pattern starting at position i has matched.
            int[] goodSuffix = GoodSuffixRule(p, m);

            // Index in text
            var i = 0;

            // Index in pattern
            int j;

            while (i <= n - m)
            {
                // Starting at end of pattern
                j = m - 1;

                // While matching
                while (j >= 0 && p[j] == t[i + j])
                {
                    j--;
                }

                // Pattern found
                if (j < 0)
                {
                    return i;
                }

                // Pattern is shifted by the maximum of the values
                // given by the good-suffix and the bad-character heuristics
                i += Math.Max(goodSuffix[j + 1], j - badChar[t[i + j]]);
            }

            // Pattern not found
            return -1;
        }

        /// <summary>
        ///     Finds out the position of its rightmost occurrence in the pattern for each symbol of the alphabet,
        ///     or -1 if the symbol does not occur in the pattern.
        /// </summary>
        /// <param name="p">Search pattern.</param>
        /// <param name="m">Length of the pattern.</param>
        /// <returns>Array of the named postition for each symbol of the alphabet.</returns>
        private static int[] BadCharacterRule(string p, int m)
        {
            // For each character (note that there are more than 256 characters)
            int[] badChar = new int[256];
            Array.Fill(badChar, -1);

            // Iterate from left to right over the pattern
            for (var j = 0; j < m; j++)
            {
                badChar[p[j]] = j;
            }

            return badChar;
        }

        /// <summary>
        ///     Finds out the shift distance of the pattern if a mismatch at position i – 1 occurs
        ///     for each character of the pattern, i.e. if the suffix of the pattern starting at position i has matched.
        /// </summary>
        /// <param name="p">Search pattern.</param>
        /// <param name="m">Length of the pattern.</param>
        /// <returns>Array of the named shift distance for each character of the pattern.</returns>
        private static int[] GoodSuffixRule(string p, int m)
        {
            // CASE 1
            // The matching suffix occurs somewhere else in the pattern
            // --> matching suffix is a border of a suffix of the pattern

            // f[i] contains starting position of the widest border of the suffix of the pattern beginning at position i
            int[] f = new int[p.Length + 1];

            // Suffix of p[m] has no border --> f[m] = m+1
            f[m] = m + 1;

            // Corresponding shift distance
            int[] s = new int[p.Length + 1];

            // Start of suffix including border of the pattern
            // (hint: https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/kmpen.htm#section2)
            var i = m;

            // Start of suffix of the pattern
            var j = m + 1;

            while (i > 0)
            {
                // checking if a shorter border that is already known can be extended to the left by the same symbol
                while (j <= m && p[i - 1] != p[j - 1])
                {
                    if (s[j] == 0)
                    {
                        s[j] = j - i;
                    }

                    j = f[j];
                }

                --i;
                --j;
                f[i] = j;
            }

            // CASE 2
            // Only a part of the matching suffix occurs at the beginning of the pattern
            // (filling remaining entries)
            j = f[0];
            for (i = 0; i <= m; i++)
            {
                // Starting postition of the greates border
                if (s[i] == 0)
                {
                    s[i] = j;
                }

                // From position i = j, it switches to the next narrower border f[j]
                if (i == j)
                {
                    j = f[j];
                }
            }

            return s;
        }
    }
}
What is an algorithm?About us Programming Languages Contribute Contact
GitHub Gitter Twitter Source code Analytics