# TODO format templates in results to match coinflip.cli.echo_series
from collections import Counter
from dataclasses import dataclass
from math import exp
from math import floor
from math import isclose
from math import log2
from math import sqrt
from random import choice
from typing import List
from scipy.special import gammaincc
from scipy.special import hyp1f1
from coinflip.randtests._decorators import randtest
from coinflip.randtests._result import TestResult
from coinflip.randtests._result import make_testvars_table
from coinflip.randtests._testutils import check_recommendations
from coinflip.randtests._testutils import rawblocks
__all__ = ["non_overlapping_template_matching", "overlapping_template_matching"]
def make_template(series, blocksize):
"""Generate a random template"""
template_size = min(max(floor(sqrt(blocksize)), 2), 12)
values = series.unique()
template = [choice(values) for _ in range(template_size)]
return template
# ------------------------------------------------------------------------------
# Non-overlapping Template Matching Test
[docs]@randtest(rec_input=288) # template_size=9, nblocks=8, blocksize=4*template_size
def non_overlapping_template_matching(series, template: List = None, nblocks=None):
"""Matches of template per block is compared to expected result
The sequence is split into blocks, where the number of non-overlapping
matches to the template in each block is found. This is referenced to the
expected mean and variance in matches of a hypothetically truly random RNG.
Parameters
----------
sequence : array-like
Output of the RNG being tested
template : `List`, optional
Template to match with the sequence, randomly generated if not
provided.
nblocks : `int`
Number of blocks to split sequence into
Returns
-------
TestResult
Dataclass that contains the test's statistic and p-value.
Raises
------
TemplateContainsElementsNotInSequenceError
If template contains values not present in sequence
"""
n = len(series)
if not nblocks:
nblocks = min(floor(sqrt(n)), 100)
blocksize = n // nblocks
check_recommendations(
{
"nblocks ≤ 100": nblocks <= 100,
"blocksize > 0.01 * n": blocksize > 0.01 * n,
"nblocks ≡ ⌊n / blocksize⌋": nblocks == n // blocksize,
}
)
if template is None:
template = make_template(series, blocksize)
template_size = len(template)
matches_expect = (blocksize - template_size + 1) / 2 ** template_size
variance = blocksize * (
(1 / 2 ** template_size) - ((2 * template_size - 1)) / 2 ** (2 * template_size)
)
block_matches = []
for block_tup in rawblocks(series, blocksize=blocksize):
matches = 0
pointer = 0
boundary = blocksize - template_size
while pointer < boundary:
window = block_tup[pointer : pointer + template_size]
if all(x == y for x, y in zip(window, template)):
matches += 1
pointer += template_size
else:
pointer += 1
block_matches.append(matches)
match_diffs = [matches - matches_expect for matches in block_matches]
statistic = sum(diff ** 2 / variance for diff in match_diffs)
p = gammaincc(nblocks / 2, statistic / 2)
return NonOverlappingTemplateMatchingTestResult(
statistic, p, template, matches_expect, variance, block_matches, match_diffs,
)
@dataclass
class NonOverlappingTemplateMatchingTestResult(TestResult):
template: List
matches_expect: float
variance: float
block_matches: List[int]
match_diffs: List[float]
def __rich_console__(self, console, options):
yield self._results_text("chi-square")
yield ""
yield f"template: {self.template}"
f_matches_expect = round(self.matches_expect, 1)
yield f"expected matches per block: {f_matches_expect}"
matches_count = Counter(self.block_matches)
table = sorted(matches_count.items())
f_table = make_testvars_table("matches", "nblocks")
for matches, nblocks in table:
f_table.add_row(str(matches), str(nblocks))
yield f_table
# ------------------------------------------------------------------------------
# Overlapping Template Matching Test
matches_ceil = 5
# TODO Review paper "Correction of Overlapping Template Matching Test Included in
# NIST Randomness Test Suite"
[docs]@randtest(rec_input=288) # TODO appropiate min input
def overlapping_template_matching(series, template: List = None, nblocks=None, df=5):
"""Overlapping matches of template per block is compared to expected result
The sequence is split into `nblocks` blocks, where the number of
overlapping matches to the template in each block is found. This is
referenced to the expected mean and variance in matches of a hypothetically
truly random RNG.
.. deprecated:: 0
`df` will be removed once I figure out the correct value, as I don't
quite understand what NIST wants (or if they're even correct!)
Parameters
----------
sequence : array-like
Output of the RNG being tested
template : `List`, optional
Template to match with the sequence, randomly generated if not
provided.
nblocks : `int`
Number of blocks to split sequence into
df : `int`, default `5`
Degrees of freedom to use in p-value calculation
Returns
-------
TestResult
Dataclass that contains the test's statistic and p-value.
Raises
------
TemplateContainsElementsNotInSequenceError
If template contains values not present in sequence
"""
n = len(series)
if not nblocks:
nblocks = floor(sqrt(n))
blocksize = n // nblocks
if not template:
template = make_template(series, blocksize)
template_size = len(template)
lambda_ = (blocksize - template_size + 1) / 2 ** template_size
eta = lambda_ / 2
first_prob = exp(-eta)
probabilities = [first_prob]
for matches in range(1, matches_ceil):
prob = ((eta * exp(-2 * eta)) / 2 ** matches) * hyp1f1(matches + 1, 2, eta)
probabilities.append(prob)
last_prob = 1 - sum(probabilities)
probabilities.append(last_prob)
check_recommendations(
{
"n ≥ nblocks * blocksize": n >= nblocks * blocksize,
"nblocks * min(probabilities) > df": nblocks * min(probabilities) > df,
"λ ≈ 2": isclose(lambda_, 2),
"len(template) ≈ log2(nblocks)": isclose(template_size, log2(nblocks)),
"df ≈ λ": isclose(template_size, log2(nblocks)),
}
)
expected_tallies = [prob * nblocks for prob in probabilities]
block_matches = []
for block_tup in rawblocks(series, blocksize=blocksize):
matches = 0
boundary = blocksize - template_size
for pointer in range(boundary + 1):
window = block_tup[pointer : pointer + template_size]
if all(x == y for x, y in zip(window, template)):
matches += 1
block_matches.append(matches)
tallies = [0 for _ in range(matches_ceil + 1)]
for matches in block_matches:
i = min(matches, 5)
tallies[i] += 1
reality_check = []
for tally_expect, tally in zip(expected_tallies, tallies):
diff = (tally - tally_expect) ** 2 / tally_expect
reality_check.append(diff)
statistic = sum(reality_check)
p = gammaincc(df / 2, statistic / 2) # TODO should first param be df / 2
return OverlappingTemplateMatchingTestResult(
statistic, p, template, expected_tallies, tallies,
)
@dataclass
class OverlappingTemplateMatchingTestResult(TestResult):
template: List
expected_tallies: List[int]
tallies: List[int]
def __post_init__(self):
self.tally_diffs = []
for expect, actual in zip(self.expected_tallies, self.tallies):
diff = actual - expect
self.tally_diffs.append(diff)
def __rich_console__(self, console, options):
yield self._results_text("chi-square")
yield ""
yield f"template: {self.template}"
f_nmatches = [f"{x}" for x in range(matches_ceil + 1)]
f_nmatches[-1] = f"{f_nmatches[-1]}+"
table = zip(f_nmatches, self.tallies, self.expected_tallies, self.tally_diffs)
f_table = make_testvars_table("matches", "count", "expected", "diff")
for f_matches, count, count_expect, diff in table:
f_count = str(count)
f_count_expect = str(round(count_expect, 1))
f_diff = str(round(diff, 1))
f_table.add_row(f_matches, f_count, f_count_expect, f_diff)
yield f_table