Source code for coinflip.randtests.template

# TODO format templates in results to match coinflip.cli.echo_series
from collections import Counter
from dataclasses import dataclass
from math import exp
from math import floor
from math import isclose
from math import log2
from math import sqrt
from random import choice
from typing import List

from scipy.special import gammaincc
from scipy.special import hyp1f1

from coinflip.randtests._decorators import randtest
from coinflip.randtests._result import TestResult
from coinflip.randtests._result import make_testvars_table
from coinflip.randtests._testutils import check_recommendations
from coinflip.randtests._testutils import rawblocks

__all__ = ["non_overlapping_template_matching", "overlapping_template_matching"]


def make_template(series, blocksize):
    """Generate a random template"""
    template_size = min(max(floor(sqrt(blocksize)), 2), 12)

    values = series.unique()
    template = [choice(values) for _ in range(template_size)]

    return template


# ------------------------------------------------------------------------------
# Non-overlapping Template Matching Test


[docs]@randtest(rec_input=288)  # template_size=9, nblocks=8, blocksize=4*template_size
def non_overlapping_template_matching(series, template: List = None, nblocks=None):
    """Matches of template per block is compared to expected result

    The sequence is split into blocks, where the number of non-overlapping
    matches to the template in each block is found. This is referenced to the
    expected mean and variance in matches of a hypothetically truly random RNG.

    Parameters
    ----------
    sequence : array-like
        Output of the RNG being tested
    template : `List`, optional
        Template to match with the sequence, randomly generated if not
        provided.
    nblocks : `int`
        Number of blocks to split sequence into

    Returns
    -------
    TestResult
        Dataclass that contains the test's statistic and p-value.

    Raises
    ------
    TemplateContainsElementsNotInSequenceError
        If template contains values not present in sequence
    """
    n = len(series)

    if not nblocks:
        nblocks = min(floor(sqrt(n)), 100)
    blocksize = n // nblocks

    check_recommendations(
        {
            "nblocks ≤ 100": nblocks <= 100,
            "blocksize > 0.01 * n": blocksize > 0.01 * n,
            "nblocks ≡ ⌊n / blocksize⌋": nblocks == n // blocksize,
        }
    )

    if template is None:
        template = make_template(series, blocksize)
    template_size = len(template)

    matches_expect = (blocksize - template_size + 1) / 2 ** template_size
    variance = blocksize * (
        (1 / 2 ** template_size) - ((2 * template_size - 1)) / 2 ** (2 * template_size)
    )

    block_matches = []
    for block_tup in rawblocks(series, blocksize=blocksize):
        matches = 0
        pointer = 0

        boundary = blocksize - template_size
        while pointer < boundary:
            window = block_tup[pointer : pointer + template_size]

            if all(x == y for x, y in zip(window, template)):
                matches += 1
                pointer += template_size
            else:
                pointer += 1

        block_matches.append(matches)

    match_diffs = [matches - matches_expect for matches in block_matches]

    statistic = sum(diff ** 2 / variance for diff in match_diffs)
    p = gammaincc(nblocks / 2, statistic / 2)

    return NonOverlappingTemplateMatchingTestResult(
        statistic, p, template, matches_expect, variance, block_matches, match_diffs,
    )


@dataclass
class NonOverlappingTemplateMatchingTestResult(TestResult):
    template: List
    matches_expect: float
    variance: float
    block_matches: List[int]
    match_diffs: List[float]

    def __rich_console__(self, console, options):
        yield self._results_text("chi-square")

        yield ""

        yield f"template: {self.template}"

        f_matches_expect = round(self.matches_expect, 1)
        yield f"expected matches per block: {f_matches_expect}"

        matches_count = Counter(self.block_matches)
        table = sorted(matches_count.items())
        f_table = make_testvars_table("matches", "nblocks")
        for matches, nblocks in table:
            f_table.add_row(str(matches), str(nblocks))
        yield f_table


# ------------------------------------------------------------------------------
# Overlapping Template Matching Test


matches_ceil = 5


# TODO Review paper "Correction of Overlapping Template Matching Test Included in
#                    NIST Randomness Test Suite"
[docs]@randtest(rec_input=288)  # TODO appropiate min input
def overlapping_template_matching(series, template: List = None, nblocks=None, df=5):
    """Overlapping matches of template per block is compared to expected result

    The sequence is split into `nblocks` blocks, where the number of
    overlapping matches to the template in each block is found. This is
    referenced to the expected mean and variance in matches of a hypothetically
    truly random RNG.

    .. deprecated:: 0
        `df` will be removed once I figure out the correct value, as I don't
        quite understand what NIST wants (or if they're even correct!)

    Parameters
    ----------
    sequence : array-like
        Output of the RNG being tested
    template : `List`, optional
        Template to match with the sequence, randomly generated if not
        provided.
    nblocks : `int`
        Number of blocks to split sequence into
    df : `int`, default `5`
        Degrees of freedom to use in p-value calculation

    Returns
    -------
    TestResult
        Dataclass that contains the test's statistic and p-value.

    Raises
    ------
    TemplateContainsElementsNotInSequenceError
        If template contains values not present in sequence
    """
    n = len(series)

    if not nblocks:
        nblocks = floor(sqrt(n))
    blocksize = n // nblocks

    if not template:
        template = make_template(series, blocksize)
    template_size = len(template)

    lambda_ = (blocksize - template_size + 1) / 2 ** template_size
    eta = lambda_ / 2

    first_prob = exp(-eta)
    probabilities = [first_prob]
    for matches in range(1, matches_ceil):
        prob = ((eta * exp(-2 * eta)) / 2 ** matches) * hyp1f1(matches + 1, 2, eta)
        probabilities.append(prob)
    last_prob = 1 - sum(probabilities)
    probabilities.append(last_prob)

    check_recommendations(
        {
            "n ≥ nblocks * blocksize": n >= nblocks * blocksize,
            "nblocks * min(probabilities) > df": nblocks * min(probabilities) > df,
            "λ ≈ 2": isclose(lambda_, 2),
            "len(template) ≈ log2(nblocks)": isclose(template_size, log2(nblocks)),
            "df ≈ λ": isclose(template_size, log2(nblocks)),
        }
    )

    expected_tallies = [prob * nblocks for prob in probabilities]

    block_matches = []
    for block_tup in rawblocks(series, blocksize=blocksize):
        matches = 0

        boundary = blocksize - template_size
        for pointer in range(boundary + 1):
            window = block_tup[pointer : pointer + template_size]

            if all(x == y for x, y in zip(window, template)):
                matches += 1

        block_matches.append(matches)

    tallies = [0 for _ in range(matches_ceil + 1)]
    for matches in block_matches:
        i = min(matches, 5)
        tallies[i] += 1

    reality_check = []
    for tally_expect, tally in zip(expected_tallies, tallies):
        diff = (tally - tally_expect) ** 2 / tally_expect
        reality_check.append(diff)

    statistic = sum(reality_check)

    p = gammaincc(df / 2, statistic / 2)  # TODO should first param be df / 2

    return OverlappingTemplateMatchingTestResult(
        statistic, p, template, expected_tallies, tallies,
    )


@dataclass
class OverlappingTemplateMatchingTestResult(TestResult):
    template: List
    expected_tallies: List[int]
    tallies: List[int]

    def __post_init__(self):
        self.tally_diffs = []
        for expect, actual in zip(self.expected_tallies, self.tallies):
            diff = actual - expect
            self.tally_diffs.append(diff)

    def __rich_console__(self, console, options):
        yield self._results_text("chi-square")

        yield ""

        yield f"template: {self.template}"

        f_nmatches = [f"{x}" for x in range(matches_ceil + 1)]
        f_nmatches[-1] = f"{f_nmatches[-1]}+"

        table = zip(f_nmatches, self.tallies, self.expected_tallies, self.tally_diffs)
        f_table = make_testvars_table("matches", "count", "expected", "diff")
        for f_matches, count, count_expect, diff in table:
            f_count = str(count)
            f_count_expect = str(round(count_expect, 1))
            f_diff = str(round(diff, 1))
            f_table.add_row(f_matches, f_count, f_count_expect, f_diff)

        yield f_table