Source code for algorithms.search.rabinkarp

"""
In computer science, the Rabin–Karp algorithm or Karp–Rabin algorithm is a string searching algorithm
created by Richard M. Karp and Michael O. Rabin (1987)
that uses hashing to find any one of a set of pattern strings in a text.
For text of length n and p patterns of combined length m,

its average and best case running time is O(n+m) in space O(p),
but its worst-case time is O(nm).

"""
import random


def _precompute_hashes(text, pattern_length, prime, x):
    """
    Function for precomputing hashes for text and length of pattern

    Args:
        text: text itself
        pattern_length: length of pattern
        prime: prime number to get modulo of
        x: x random number between 1 and prime

    Returns:
        list of hashes
    """

    hash_list = [0 for _ in range(0, len(text) - pattern_length + 1)]
    S = text[len(text) - pattern_length:len(text)]
    hash_list[len(text) - pattern_length] = _poly_hash(S, prime, x)
    y = 1
    for i in range(1, pattern_length + 1):
        y = (y * x) % prime
    for i in reversed(range(0, len(text) - pattern_length)):
        hash_list[i] = (x * hash_list[i + 1] + ord(text[i]) - y * ord(text[i + pattern_length])) % prime

    return hash_list


def _poly_hash(text, prime, x):
    """
    Function for generating hash value from text

    Args:
        text: simple text
        prime: prime number to get modulo of
        x: random number between 1 and prime

    Returns:
        hash value of text
    """

    ans = 0
    for c in reversed(text):
        ans = (ans * x + ord(c)) % prime
    return ans


def _find_positions(hash_list, pHash, pattern, text):
    """
    Function for finding position of pattern in text

    Args:
        hash_list: list of hashes from text
        pHash: hash of pattern
        pattern: pattern itself
        text:  text itself

    Returns:
        list of positions where pattern was found

    """

    result = []
    for i in range(0, len(text) - len(pattern) + 1):
        if pHash != hash_list[i]:
            continue
        if text[i:i + len(pattern)] == pattern:  # Only if hashes are the same we compare text symbol by symbol
            result.append(i)
    return result


[docs]def rabin_karp(text, pattern): """ Rabin-Karp algorithm that finds all occurrences of pattern in text Args: text: text to search in pattern: pattern to search for Returns: list of position where pattern placed in text Examples: >>> rabin_karp('AABAACAADAABAABA', 'AABA') [0, 9, 12] >>> rabin_karp('aaaaa', 'aaa') [0, 1, 2] """ prime = 100000007 x = random.randint(1, prime) pHash = _poly_hash(pattern, prime, x) hash_list = _precompute_hashes(text, len(pattern), prime, x) return _find_positions(hash_list, pHash, pattern, text)