Module `wuggy.generators.wuggygenerator`

Expand source code

import codecs
import copy
import importlib
import inspect
import os
from collections import defaultdict, namedtuple
from csv import writer
from fractions import Fraction
from functools import wraps
from pathlib import Path
from shutil import rmtree
from sys import stdout
from time import time
from typing import Dict, Generator, Optional, Union
from urllib.request import urlopen
from warnings import warn

from ..plugins.baselanguageplugin import BaseLanguagePlugin
from ..utilities.bigramchain import BigramChain


def _loaded_language_plugin_required(func):
    """
    Decorator used for regular Wuggy methods to ensure that a valid language plugin is loaded before execution.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        if not hasattr(args[0], 'language_plugin'):
            raise Exception(
                "This function cannot be called if no language plugin is loaded!")
        return func(*args, **kwargs)
    return wrapper


def _loaded_language_plugin_required_generator(func):
    """
    Decorator used for Wuggy generator methods to ensure that a valid language plugin is loaded before execution.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        if not hasattr(args[0], 'language_plugin'):
            raise Exception(
                "The generator cannot be iterated if no language plugin is loaded!")
        gen = func(*args, **kwargs)
        for val in gen:
            yield val
    return wrapper


class WuggyGenerator():
    def __init__(self):
        self.bigramchain = None
        self.bigramchains = {}
        self.supported_official_language_plugin_names = [
            "orthographic_basque",
            "orthographic_dutch",
            "orthographic_english",
            "orthographic_french",
            "orthographic_german",
            "orthographic_italian",
            "orthographic_polish",
            "orthographic_serbian_cyrillic",
            "orthographic_serbian_latin",
            "orthographic_spanish",
            "orthographic_vietnamese",
            "orthographic_estonian",
            "phonetic_english_celex",
            "phonetic_english_cmu",
            "phonetic_french",
            "phonetic_italian"]
        self.__official_language_plugin_repository_url = "https://raw.githubusercontent.com/WuggyCode/wuggy_language_plugin_data/master"
        self.attribute_subchain = None
        self.frequency_subchain = None
        self.reference_sequence = None
        self.frequency_filter = None
        self.current_sequence = None
        self.output_mode = None
        self.supported_statistics = ()
        self.supported_attribute_filters = {}
        self.attribute_filters = {}
        self.default_attributes = []
        self.statistics = {}
        self.word_lexicon = defaultdict(list)
        self.neighbor_lexicon = []
        self.reference_statistics = {}
        self.stat_cache = {}
        self.sequence_cache = []
        self.difference_statistics = {}
        self.match_statistics = {}
        self.lookup_lexicon = {}

    def load(self, language_plugin_name: str,
             local_language_plugin: BaseLanguagePlugin = None) -> None:
        """
        Loads in a language plugin, if available, and stores the corresponding bigramchains.
        Parameters:
            language_plugin_name: must be the exact string of an official language plugin (see self.supported_official_language_plugin_names). If you are loading in a local plugin, the name can be anything as long as it does not conflict with an already loaded plugin name.

            local_language_plugin: must be a child class of BaseLanguagePlugin: see BaseLanguagePlugin for more information on how to create a custom language plugin.
        """
        if local_language_plugin:
            # TODO: if someone does not pass a class INSTANCE, they get TypeError: <class 'type'> is a built-in class, this is a vague error and probably should be abstracted
            self.language_plugin_data_path = os.path.dirname(
                inspect.getfile(local_language_plugin.__class__))
            self.language_plugin_name = language_plugin_name
            language_plugin = local_language_plugin

        if local_language_plugin is None:
            if language_plugin_name not in self.supported_official_language_plugin_names:
                raise ValueError(
                    "This language is not officially supported by Wuggy at this moment. If this is a local plugin, pass the local_language_plugin")
            self.language_plugin_name = language_plugin_name
            language_plugins_folder_dirname = os.path.join(
                Path(__file__).parents[1], "plugins", "language_data")
            self.language_plugin_data_path = os.path.join(
                language_plugins_folder_dirname, language_plugin_name)
            if not os.path.exists(self.language_plugin_data_path):
                self.download_language_plugin(
                    language_plugin_name)
            # Official language plugins MUST have the class name "OfficialLanguagePlugin"!
            language_plugin = importlib.import_module(
                f".plugins.language_data.{language_plugin_name}.{language_plugin_name}",
                "wuggy").OfficialLanguagePlugin()

        if language_plugin_name not in self.bigramchains:
            default_data_path = os.path.join(
                self.language_plugin_data_path, language_plugin.default_data)

            data_file = codecs.open(default_data_path, 'r', encoding='utf-8')
            self.bigramchains[self.language_plugin_name] = BigramChain(
                language_plugin)
            self.bigramchains[self.language_plugin_name].load(
                data_file)
        self.__activate(self.language_plugin_name)

    @staticmethod
    def remove_downloaded_language_plugins() -> None:
        """
        Removes all downloaded (official) language plugins.
        Useful to cleanup after an experiment or to remove corrupt language plugins.
        """
        try:
            rmtree(os.path.join(Path(__file__).parents[1], "plugins", "language_data"))
        except FileNotFoundError as err:
            raise FileNotFoundError(
                "The official language plugin folder is already removed.") from err

    def download_language_plugin(
            self, language_plugin_name: str, auto_download=False) -> None:
        """
        Downloads and saves given language plugin to local storage from the corresponding official file repository.
        This method is called when you load in a language plugin automatically and you are missing the plugin locally.
        If you need to ensure your Wuggy script works on any machine without user confirmation, execute this method with the auto_download flag set to True before using the load method.
        Parameters:
            language_plugin_name: this is the name for the official language plugin you want to download. If the language name is not officially supported, the method will throw an error.

            auto_download: determines whether Wuggy provides the user with a prompt to confirm downloading a language plugin.
        """
        if language_plugin_name not in self.supported_official_language_plugin_names:
            raise ValueError("This language is not officially supported by Wuggy at this moment.")

        if not auto_download:
            while True:
                stdout.write(
                    f"The language plugin {language_plugin_name} was not found in local storage. Do you allow Wuggy to download this plugin? [y/n]\n")
                choice = input().lower()
                if (not (choice.startswith("y") or choice.startswith("n"))):
                    stdout.write("Please respond with 'y' or 'n'")
                elif choice.startswith("n"):
                    raise ValueError(
                        "User declined permission for Wuggy to download necessary language plugin.")
                else:
                    break

        language_plugins_folder_dirname = os.path.join(
            Path(__file__).parents[1], "plugins", "language_data")
        if not os.path.exists(language_plugins_folder_dirname):
            os.makedirs(language_plugins_folder_dirname)

        self.language_plugin_data_path = os.path.join(
            language_plugins_folder_dirname, language_plugin_name)
        if not os.path.exists(self.language_plugin_data_path):
            os.makedirs(self.language_plugin_data_path)

        print(
            f"Wuggy is currently downloading the plugin {language_plugin_name} for you from the official repository...")
        
        py_file_name = f"{language_plugin_name}.py"
        print(f"{self.__official_language_plugin_repository_url}/{language_plugin_name}/{py_file_name}")
        py_file = urlopen(
            f"{self.__official_language_plugin_repository_url}/{language_plugin_name}/{py_file_name}")

        file = open(f'{self.language_plugin_data_path}/{py_file_name}',
                    'w', encoding="utf-8")
        # The current setup assumes that every official Wuggy language plugin use a single data file
        for line in py_file:
            file.write(line.decode("utf-8"))
        data_file_name = f"{language_plugin_name}.txt"
        data_file = urlopen(
            f"{self.__official_language_plugin_repository_url}/{language_plugin_name}/{data_file_name}")
        file = open(f'{self.language_plugin_data_path}/{data_file_name}',
                    'w', encoding="utf-8")

        for line in data_file:
            file.write(line.decode("utf-8"))

        print(f"Wuggy has finished downloading {language_plugin_name}.")

    def __activate(self, name: str) -> None:
        """
        Activate a language plugin by setting the corresponding bigramchains and lexicon properties.
        This deactivates and garbage collects any previously activated language plugin.
        Should only be called internally, do not call on your own.
        """
        if isinstance(name, type(codecs)):
            name = name.__name__
        self.bigramchain = self.bigramchains[name]
        self.language_plugin = self.bigramchain.language_plugin
        self.__load_neighbor_lexicon()
        self.__load_word_lexicon()
        self.__load_lookup_lexicon()
        self.supported_statistics = self.__get_statistics()
        self.supported_attribute_filters = self.__get_attributes()
        self.default_attributes = self.__get_default_attributes()
        self.current_language_plugin_name = name

    def __load_word_lexicon(self) -> None:
        """
        Loads the default word lexicon for the currently set language plugin.
        This is currently used internally by __activate only, do not call on your own.
        """
        cutoff = 0
        data_file = codecs.open(
            "%s/%s" % (self.language_plugin_data_path, self.language_plugin.default_word_lexicon),
            'r', encoding="utf-8")
        self.word_lexicon = defaultdict(list)
        lines = data_file.readlines()
        for line in lines:
            fields = line.strip().split('\t')
            word = fields[0]
            frequency_per_million = fields[-1]
            if float(frequency_per_million) >= cutoff:
                self.word_lexicon[word[0], len(word)].append(word)
        data_file.close()

    def __load_neighbor_lexicon(self) -> None:
        """
        Loads the default neighbor word lexicon for the currently set language plugin.
        This is currently used internally by __activate only, do not call on your own.
        """
        cutoff = 0
        data_file = codecs.open(
            "%s/%s" %
            (self.language_plugin_data_path,
             self.language_plugin.default_neighbor_lexicon),
            'r',
            encoding="utf-8")
        self.neighbor_lexicon = []
        lines = data_file.readlines()
        for line in lines:
            fields = line.strip().split('\t')
            word = fields[0]
            frequency_per_million = fields[-1]
            if float(frequency_per_million) >= cutoff:
                self.neighbor_lexicon.append(word)
        data_file.close()

    def __load_lookup_lexicon(self, data_file: bool = None) -> None:
        """
        Loads the default lookup word lexicon for the currently set language plugin.
        This is currently used internally by __activate only, do not call on your own.
        """
        self.lookup_lexicon = {}
        if data_file is None:
            data_file = codecs.open(
                "%s/%s" %
                (self.language_plugin_data_path,
                 self.language_plugin.default_lookup_lexicon),
                'r',
                encoding="utf-8")
        lines = data_file.readlines()
        for line in lines:
            fields = line.strip().split(self.language_plugin.separator)
            reference, representation = fields[0:2]
            self.lookup_lexicon[reference] = representation
        data_file.close()

    def lookup_reference_segments(self, reference: str) -> Optional[str]:
        """
        Look up a given reference (word) from the currently active lookup lexicon.
        Returns the segments of the found word, if the word is not found it returns None.
        This should be used before setting a word as a reference sequence.
        """
        return self.lookup_lexicon.get(reference, None)

    def __get_attributes(self) -> [namedtuple]:
        """
        Returns a list of all attribute fields of the currently activated language plugin as a named tuple.
        This should only be used internally, read the property "supported_attribute_filters" instead.
        """
        return self.language_plugin.Segment._fields

    def __get_default_attributes(self) -> [str]:
        """
        Returns a list of default attribute fields of the currently activated language plugin.
        This should only be used internally, read the property "default_attributes" instead.
        """
        return self.language_plugin.default_fields

    @_loaded_language_plugin_required
    def set_reference_sequence(self, sequence: str) -> None:
        """
        Set the reference sequence.
        This is commonly used before generate methods in order to set the reference word for which pseudowords should be generated.
        """
        self.reference_sequence = self.language_plugin.transform(
            sequence).representation
        self.reference_sequence_frequencies = self.bigramchain.get_frequencies(
            self.reference_sequence)
        self.__clear_stat_cache()
        for name in self.__get_statistics():
            function = eval("self.language_plugin.statistic_%s" % (name))
            self.reference_statistics[name] = function(
                self, self.reference_sequence)

    def __get_statistics(self) -> [str]:
        """
        Lists all statistics supported by a given language plugin.
        This should only be used internally, read the property "supported_statistics" instead.
        """
        names = [name for name in dir(
            self.language_plugin) if name.startswith('statistic')]
        return [name.replace('statistic_', '') for name in names]

    def set_statistic(self, name: str) -> None:
        """
        Enable a statistic based on its name.
        """
        if name not in self.supported_statistics:
            raise ValueError(f"Statistic {name} is not supported.")
        self.statistics[name] = None

    def set_statistics(self, names: [str]) -> None:
        """
        Enables statistics based on their names.
        """
        for name in names:
            if name not in self.supported_statistics:
                self.statistics = {}
                raise ValueError(f"Statistic {name} is not supported.")
            self.statistics[name] = None

    def set_all_statistics(self) -> None:
        """
        Enable all statistics supported by the current active language plugin.
        Enabling all statistics increases word generation computation time, especially for statistics such as ned1.
        """
        self.set_statistics(self.supported_statistics)

    def apply_statistics(self, sequence: str = None) -> None:
        """
        Apply all statistics which were set beforehand.
        """
        if sequence is None:
            sequence = self.current_sequence
        for name in self.statistics:
            function = eval("self.language_plugin.statistic_%s" % (name))
            if (sequence, name) in self.stat_cache:
                self.statistics[name] = self.stat_cache[(sequence, name)]
            else:
                self.statistics[name] = function(self, sequence)
                self.stat_cache[(sequence, name)] = self.statistics[name]
            if 'match' in function.__dict__:
                self.match_statistics[name] = function.match(
                    self.statistics[name], self.reference_statistics[name])
            if 'difference' in function.__dict__:
                self.difference_statistics[name] = function.difference(
                    self.statistics[name], self.reference_statistics[name])

    def clear_statistics(self) -> None:
        """
        Clear all the statistics set previously.
        """
        self.statistics = {}

    def __clear_stat_cache(self) -> None:
        """
        Clears the statistics cache. Only used by Wuggy internally.
        """
        self.stat_cache = {}

    def __clear_sequence_cache(self) -> None:
        """
        Clears the sequence cache. Only used by Wuggy internally.
        """
        self.sequence_cache = []

    def list_output_modes(self) -> [str]:
        """
        List output modes of the currently activated language plugin.
        """
        names = [name for name in dir(
            self.language_plugin) if name.startswith('output')]
        return [name.replace('output_', '') for name in names]

    def set_output_mode(self, name: str) -> None:
        """
        Set an output mode supported by the currently activated language plugin.
        """
        if name not in self.list_output_modes():
            raise ValueError(f"Output mode {name} is not supported.")
        self.output_mode = eval("self.language_plugin.output_%s" % (name))

    def set_attribute_filter(self, name: str) -> None:
        """
        Set an attribute filter supported by the currently activated language plugin.
        """
        reference_sequence = self.reference_sequence
        if name not in self.supported_attribute_filters:
            raise ValueError(
                f"Attribute filter {name} is not supported.")
        self.attribute_filters[name] = reference_sequence
        self.attribute_subchain = None

    def set_attribute_filters(self, names: [str]) -> None:
        """
        Set attribute filters supported by the currently activated language plugin.
        """
        for name in names:
            self.set_attribute_filter(name)

    def __apply_attribute_filters(self) -> None:
        """
        Apply all set attribute filters.
        This is currently used by Wuggy internally, do not call on your own.
        """
        for attribute, reference_sequence in self.attribute_filters.items():
            subchain = self.attribute_subchain if self.attribute_subchain is not None else self.bigramchain
            self.attribute_subchain = subchain.attribute_filter(
                reference_sequence, attribute)

    def clear_attribute_filters(self) -> None:
        """
        Remove all set attribute filters.
        """
        self.attribute_filters = {}

    def set_frequency_filter(self, lower: int, upper: int) -> None:
        """
        Sets the frequency filter for concentric search.
        Stricter search (small values for lower and upper) result in faster word generation.
        """
        self.frequency_filter = (self.reference_sequence, lower, upper)

    def clear_frequency_filter(self) -> None:
        """
        Clear the previously set frequency filter.
        """
        self.frequency_filter = None
        self.frequency_subchain = None

    def apply_frequency_filter(self) -> None:
        """
        Apply the previously set frequency filter.
        """
        if self.frequency_filter is None:
            raise Exception("No frequency filter was set")
        reference_sequence, lower, upper = self.frequency_filter
        subchain = self.attribute_subchain if self.attribute_subchain is not None else self.bigramchain
        self.frequency_subchain = subchain.frequency_filter(
            reference_sequence, lower, upper)

    @_loaded_language_plugin_required
    def generate_classic(
            self, input_sequences: [str],
            ncandidates_per_sequence: int = 10, max_search_time_per_sequence: int = 10,
            subsyllabic_segment_overlap_ratio: Union[Fraction, None] = Fraction(2, 3),
            match_subsyllabic_segment_length: bool = True, match_letter_length: bool = True,
            output_mode: str = "plain", concentric_search: bool = True) -> [Dict]:
        """
        This is the classic method to generate pseudowords using Wuggy and can be called immediately after loading a language plugin.
        The defaults for this method are similar to those set in the legacy version of Wuggy, resulting in sensible pseudowords.
        This method returns a list of pseudoword matches, including all match and difference statistics (lexicality, ned1, old2, plain_length, deviation statistics...).
        Beware that this method always clears the sequence cache and all previously set filters.
        Parameters:
            input_sequences: these are the input sequences (words) for which you want to generate pseudowords.

            ncandidates_per_sequence: this is the n (maximum) amount of pseudowords you want to generate per input sequence.

            max_search_time_per_sequence: this is the maximum time in seconds to search for pseudowords per input sequence.

            subsyllabic_segment_overlap_ratio: this is the Fraction ratio for overlap between subsyllabic segments. The default ensures your pseudowords are very word-like but not easily identifiable as related to an existing word. If set to None, this constraint is not applied.

            match_subsyllabic_segment_length: determines whether the generated pseudowords must retain the same subsyllabic segment length as the respective input sequence.

            match_letter_length: determines whether the generated pseudowords must retain the same word length as the respective input sequence. This option is redundant if match_subsyllabic_segment_length is set to True.

            output_mode: output mode for pseudowords, constricted by the output modes supported by the currently loaded language plugin.

            concentric_search: enable/disable concentric search. Wuggy operates best and fastest when concentric search is enabled. First, the algorithm will try to generate candidates that exactly match the transition frequencies of the reference word. Then the maximal allowed deviation in transition frequencies will increase by powers of 2 (i.e., +/-2, +/-4, +/-8, etc.).
        .. include:: ../../documentation/wuggygenerator/generate_classic.md
        """
        pseudoword_matches = []
        for input_sequence in input_sequences:
            pseudoword_matches.extend(
                self.__generate_classic_inner(
                    input_sequence,
                    ncandidates_per_sequence,
                    max_search_time_per_sequence,
                    subsyllabic_segment_overlap_ratio,
                    match_subsyllabic_segment_length,
                    match_letter_length, output_mode, concentric_search))
        return pseudoword_matches

    def __generate_classic_inner(
            self, input_sequence: str, ncandidates_per_sequence: int, max_search_time: int,
            subsyllabic_segment_overlap_ratio: Union[Fraction, None],
            match_subsyllabic_segment_length: bool, match_letter_length: bool, output_mode: str,
            concentric_search: bool = True):
        """
        Inner method for generate_classic(), which outputs a list of pseudoword matches for an input sequence.
        Should only be used by WuggyGenerator internally.
        """
        self.__clear_sequence_cache()
        self.clear_attribute_filters()
        self.clear_frequency_filter()
        input_sequence_segments = self.lookup_reference_segments(input_sequence)
        if input_sequence_segments is None:
            raise Exception(
                f"Sequence {input_sequence} was not found in lexicon {self.current_language_plugin_name}")
        self.set_reference_sequence(input_sequence_segments)
        self.set_output_mode(output_mode)
        subchain = self.bigramchain
        starttime = time()
        pseudoword_matches = []
        frequency_exponent = 1
        if match_subsyllabic_segment_length:
            self.set_attribute_filter("segment_length")
            self.__apply_attribute_filters()
            subchain = self.attribute_subchain
        while True:
            if concentric_search:
                self.set_frequency_filter(
                    2**frequency_exponent, 2**frequency_exponent)
                frequency_exponent += 1
                self.apply_frequency_filter()
                subchain = self.frequency_subchain
            subchain = subchain.clean(len(self.reference_sequence) - 1)
            subchain.set_startkeys(self.reference_sequence)
            for sequence in subchain.generate():
                # Mandatory statistics before finding a suitable match
                self.clear_statistics()
                self.set_statistics(["overlap_ratio", "plain_length", "lexicality"])
                if (time() - starttime) >= max_search_time:
                    return pseudoword_matches
                if self.language_plugin.output_plain(sequence) in self.sequence_cache:
                    continue
                self.current_sequence = sequence
                self.apply_statistics()
                if (not match_subsyllabic_segment_length and match_letter_length and self.difference_statistics["plain_length"] != 0):
                    continue
                if (subsyllabic_segment_overlap_ratio is not None and self.statistics["overlap_ratio"] !=
                        subsyllabic_segment_overlap_ratio):
                    continue
                if self.statistics["lexicality"] == "W":
                    continue
                # (Re)apply all statistics only if match is found: else search becomes unnecessarily slow
                self.set_all_statistics()
                self.apply_statistics()
                self.sequence_cache.append(
                    self.language_plugin.output_plain(sequence))
                match = {"word": input_sequence,
                         "segments": input_sequence_segments,
                         "pseudoword": self.output_mode(sequence)}
                match.update({"statistics": self.statistics,
                              "difference_statistics": self.difference_statistics})

                pseudoword_matches.append(copy.deepcopy(match))
                if len(pseudoword_matches) >= ncandidates_per_sequence:
                    return pseudoword_matches

    @_loaded_language_plugin_required_generator
    def generate_advanced(self, clear_cache: bool = True) -> Union[Generator[str, None, None],
                                                                   Generator[tuple, None, None]]:
        """
        Creates a custom generator which can be iterated to return generated pseudowords.
        The generator's settings, such as output statistics, should be set by you before calling this method.
        If attributes such as \"output_mode\" are not set, sensible defaults are used.
        Note that this method is for advanced users and may result in unexpected results if handled incorrectly.
        .. include:: ../../documentation/wuggygenerator/generate_advanced.md
        """
        if clear_cache:
            self.__clear_sequence_cache()
        if self.output_mode is None:
            self.set_output_mode("plain")
        if len(self.attribute_filters) == 0 and self.frequency_subchain is None:
            subchain = self.bigramchain
        if len(self.attribute_filters) != 0:
            if self.attribute_subchain is None:
                self.__apply_attribute_filters()
            subchain = self.attribute_subchain
        if self.frequency_filter is not None:
            self.apply_frequency_filter()
            subchain = self.frequency_subchain
        if self.reference_sequence is not None:
            subchain = subchain.clean(len(self.reference_sequence) - 1)
            subchain.set_startkeys(self.reference_sequence)
        else:
            warn(
                "No reference sequence was set. Ignore this message if this was intentional.")
            subchain.set_startkeys()
        for sequence in subchain.generate():
            if self.language_plugin.output_plain(sequence) in self.sequence_cache:
                pass
            else:
                self.sequence_cache.append(
                    self.language_plugin.output_plain(sequence))
                self.current_sequence = sequence
                self.apply_statistics()
                yield self.output_mode(sequence)

    def export_classic_pseudoword_matches_to_csv(
            self, pseudoword_matches: [Dict],
            csv_path: str) -> None:
        """
        Helper function to export generated pseudoword matches from generate_classic to CSV.
        The dictionairies from the matches are flattened before exporting to CSV.
        Parameters:
            pseudoword_matches: a dictionary of pseudoword matches retrieved from generate_classic

            csv_path: relative path to save csv file to (including the filename, e.g. ./pseudowords.csv)
        """
        def get_csv_headers(dictionary: dict):
            headers = []

            def flatten_nested_dict_keys(dictionary: dict, parent_dict_key=None):
                for key, value in dictionary.items():
                    key = str(key)
                    if isinstance(value, dict):
                        flatten_nested_dict_keys(
                            value, (parent_dict_key + "_" + key if parent_dict_key else key))
                    else:
                        if parent_dict_key:
                            headers.append((parent_dict_key + "_" + key))
                        else:
                            headers.append(key)
                return headers
            flatten_nested_dict_keys(dictionary)
            return headers

        def get_values_from_nested_dictionary(dictionary: dict):
            dict_vals = []

            def flatten_nested_dict_values(dictionary: dict):
                for value in dictionary.values():
                    if isinstance(value, dict):
                        flatten_nested_dict_values(value)
                    else:
                        dict_vals.append(value)
            flatten_nested_dict_values(dictionary)
            return dict_vals

        with open(csv_path, "w", newline='') as csvfile:
            file_writer = writer(csvfile)
            file_writer.writerow(get_csv_headers(pseudoword_matches[0]))
            for match in pseudoword_matches:
                file_writer.writerow(get_values_from_nested_dictionary(match))

Classes

class WuggyGenerator

Expand source code

class WuggyGenerator():
    def __init__(self):
        self.bigramchain = None
        self.bigramchains = {}
        self.supported_official_language_plugin_names = [
            "orthographic_basque",
            "orthographic_dutch",
            "orthographic_english",
            "orthographic_french",
            "orthographic_german",
            "orthographic_italian",
            "orthographic_polish",
            "orthographic_serbian_cyrillic",
            "orthographic_serbian_latin",
            "orthographic_spanish",
            "orthographic_vietnamese",
            "orthographic_estonian",
            "phonetic_english_celex",
            "phonetic_english_cmu",
            "phonetic_french",
            "phonetic_italian"]
        self.__official_language_plugin_repository_url = "https://raw.githubusercontent.com/WuggyCode/wuggy_language_plugin_data/master"
        self.attribute_subchain = None
        self.frequency_subchain = None
        self.reference_sequence = None
        self.frequency_filter = None
        self.current_sequence = None
        self.output_mode = None
        self.supported_statistics = ()
        self.supported_attribute_filters = {}
        self.attribute_filters = {}
        self.default_attributes = []
        self.statistics = {}
        self.word_lexicon = defaultdict(list)
        self.neighbor_lexicon = []
        self.reference_statistics = {}
        self.stat_cache = {}
        self.sequence_cache = []
        self.difference_statistics = {}
        self.match_statistics = {}
        self.lookup_lexicon = {}

    def load(self, language_plugin_name: str,
             local_language_plugin: BaseLanguagePlugin = None) -> None:
        """
        Loads in a language plugin, if available, and stores the corresponding bigramchains.
        Parameters:
            language_plugin_name: must be the exact string of an official language plugin (see self.supported_official_language_plugin_names). If you are loading in a local plugin, the name can be anything as long as it does not conflict with an already loaded plugin name.

            local_language_plugin: must be a child class of BaseLanguagePlugin: see BaseLanguagePlugin for more information on how to create a custom language plugin.
        """
        if local_language_plugin:
            # TODO: if someone does not pass a class INSTANCE, they get TypeError: <class 'type'> is a built-in class, this is a vague error and probably should be abstracted
            self.language_plugin_data_path = os.path.dirname(
                inspect.getfile(local_language_plugin.__class__))
            self.language_plugin_name = language_plugin_name
            language_plugin = local_language_plugin

        if local_language_plugin is None:
            if language_plugin_name not in self.supported_official_language_plugin_names:
                raise ValueError(
                    "This language is not officially supported by Wuggy at this moment. If this is a local plugin, pass the local_language_plugin")
            self.language_plugin_name = language_plugin_name
            language_plugins_folder_dirname = os.path.join(
                Path(__file__).parents[1], "plugins", "language_data")
            self.language_plugin_data_path = os.path.join(
                language_plugins_folder_dirname, language_plugin_name)
            if not os.path.exists(self.language_plugin_data_path):
                self.download_language_plugin(
                    language_plugin_name)
            # Official language plugins MUST have the class name "OfficialLanguagePlugin"!
            language_plugin = importlib.import_module(
                f".plugins.language_data.{language_plugin_name}.{language_plugin_name}",
                "wuggy").OfficialLanguagePlugin()

        if language_plugin_name not in self.bigramchains:
            default_data_path = os.path.join(
                self.language_plugin_data_path, language_plugin.default_data)

            data_file = codecs.open(default_data_path, 'r', encoding='utf-8')
            self.bigramchains[self.language_plugin_name] = BigramChain(
                language_plugin)
            self.bigramchains[self.language_plugin_name].load(
                data_file)
        self.__activate(self.language_plugin_name)

    @staticmethod
    def remove_downloaded_language_plugins() -> None:
        """
        Removes all downloaded (official) language plugins.
        Useful to cleanup after an experiment or to remove corrupt language plugins.
        """
        try:
            rmtree(os.path.join(Path(__file__).parents[1], "plugins", "language_data"))
        except FileNotFoundError as err:
            raise FileNotFoundError(
                "The official language plugin folder is already removed.") from err

    def download_language_plugin(
            self, language_plugin_name: str, auto_download=False) -> None:
        """
        Downloads and saves given language plugin to local storage from the corresponding official file repository.
        This method is called when you load in a language plugin automatically and you are missing the plugin locally.
        If you need to ensure your Wuggy script works on any machine without user confirmation, execute this method with the auto_download flag set to True before using the load method.
        Parameters:
            language_plugin_name: this is the name for the official language plugin you want to download. If the language name is not officially supported, the method will throw an error.

            auto_download: determines whether Wuggy provides the user with a prompt to confirm downloading a language plugin.
        """
        if language_plugin_name not in self.supported_official_language_plugin_names:
            raise ValueError("This language is not officially supported by Wuggy at this moment.")

        if not auto_download:
            while True:
                stdout.write(
                    f"The language plugin {language_plugin_name} was not found in local storage. Do you allow Wuggy to download this plugin? [y/n]\n")
                choice = input().lower()
                if (not (choice.startswith("y") or choice.startswith("n"))):
                    stdout.write("Please respond with 'y' or 'n'")
                elif choice.startswith("n"):
                    raise ValueError(
                        "User declined permission for Wuggy to download necessary language plugin.")
                else:
                    break

        language_plugins_folder_dirname = os.path.join(
            Path(__file__).parents[1], "plugins", "language_data")
        if not os.path.exists(language_plugins_folder_dirname):
            os.makedirs(language_plugins_folder_dirname)

        self.language_plugin_data_path = os.path.join(
            language_plugins_folder_dirname, language_plugin_name)
        if not os.path.exists(self.language_plugin_data_path):
            os.makedirs(self.language_plugin_data_path)

        print(
            f"Wuggy is currently downloading the plugin {language_plugin_name} for you from the official repository...")
        
        py_file_name = f"{language_plugin_name}.py"
        print(f"{self.__official_language_plugin_repository_url}/{language_plugin_name}/{py_file_name}")
        py_file = urlopen(
            f"{self.__official_language_plugin_repository_url}/{language_plugin_name}/{py_file_name}")

        file = open(f'{self.language_plugin_data_path}/{py_file_name}',
                    'w', encoding="utf-8")
        # The current setup assumes that every official Wuggy language plugin use a single data file
        for line in py_file:
            file.write(line.decode("utf-8"))
        data_file_name = f"{language_plugin_name}.txt"
        data_file = urlopen(
            f"{self.__official_language_plugin_repository_url}/{language_plugin_name}/{data_file_name}")
        file = open(f'{self.language_plugin_data_path}/{data_file_name}',
                    'w', encoding="utf-8")

        for line in data_file:
            file.write(line.decode("utf-8"))

        print(f"Wuggy has finished downloading {language_plugin_name}.")

    def __activate(self, name: str) -> None:
        """
        Activate a language plugin by setting the corresponding bigramchains and lexicon properties.
        This deactivates and garbage collects any previously activated language plugin.
        Should only be called internally, do not call on your own.
        """
        if isinstance(name, type(codecs)):
            name = name.__name__
        self.bigramchain = self.bigramchains[name]
        self.language_plugin = self.bigramchain.language_plugin
        self.__load_neighbor_lexicon()
        self.__load_word_lexicon()
        self.__load_lookup_lexicon()
        self.supported_statistics = self.__get_statistics()
        self.supported_attribute_filters = self.__get_attributes()
        self.default_attributes = self.__get_default_attributes()
        self.current_language_plugin_name = name

    def __load_word_lexicon(self) -> None:
        """
        Loads the default word lexicon for the currently set language plugin.
        This is currently used internally by __activate only, do not call on your own.
        """
        cutoff = 0
        data_file = codecs.open(
            "%s/%s" % (self.language_plugin_data_path, self.language_plugin.default_word_lexicon),
            'r', encoding="utf-8")
        self.word_lexicon = defaultdict(list)
        lines = data_file.readlines()
        for line in lines:
            fields = line.strip().split('\t')
            word = fields[0]
            frequency_per_million = fields[-1]
            if float(frequency_per_million) >= cutoff:
                self.word_lexicon[word[0], len(word)].append(word)
        data_file.close()

    def __load_neighbor_lexicon(self) -> None:
        """
        Loads the default neighbor word lexicon for the currently set language plugin.
        This is currently used internally by __activate only, do not call on your own.
        """
        cutoff = 0
        data_file = codecs.open(
            "%s/%s" %
            (self.language_plugin_data_path,
             self.language_plugin.default_neighbor_lexicon),
            'r',
            encoding="utf-8")
        self.neighbor_lexicon = []
        lines = data_file.readlines()
        for line in lines:
            fields = line.strip().split('\t')
            word = fields[0]
            frequency_per_million = fields[-1]
            if float(frequency_per_million) >= cutoff:
                self.neighbor_lexicon.append(word)
        data_file.close()

    def __load_lookup_lexicon(self, data_file: bool = None) -> None:
        """
        Loads the default lookup word lexicon for the currently set language plugin.
        This is currently used internally by __activate only, do not call on your own.
        """
        self.lookup_lexicon = {}
        if data_file is None:
            data_file = codecs.open(
                "%s/%s" %
                (self.language_plugin_data_path,
                 self.language_plugin.default_lookup_lexicon),
                'r',
                encoding="utf-8")
        lines = data_file.readlines()
        for line in lines:
            fields = line.strip().split(self.language_plugin.separator)
            reference, representation = fields[0:2]
            self.lookup_lexicon[reference] = representation
        data_file.close()

    def lookup_reference_segments(self, reference: str) -> Optional[str]:
        """
        Look up a given reference (word) from the currently active lookup lexicon.
        Returns the segments of the found word, if the word is not found it returns None.
        This should be used before setting a word as a reference sequence.
        """
        return self.lookup_lexicon.get(reference, None)

    def __get_attributes(self) -> [namedtuple]:
        """
        Returns a list of all attribute fields of the currently activated language plugin as a named tuple.
        This should only be used internally, read the property "supported_attribute_filters" instead.
        """
        return self.language_plugin.Segment._fields

    def __get_default_attributes(self) -> [str]:
        """
        Returns a list of default attribute fields of the currently activated language plugin.
        This should only be used internally, read the property "default_attributes" instead.
        """
        return self.language_plugin.default_fields

    @_loaded_language_plugin_required
    def set_reference_sequence(self, sequence: str) -> None:
        """
        Set the reference sequence.
        This is commonly used before generate methods in order to set the reference word for which pseudowords should be generated.
        """
        self.reference_sequence = self.language_plugin.transform(
            sequence).representation
        self.reference_sequence_frequencies = self.bigramchain.get_frequencies(
            self.reference_sequence)
        self.__clear_stat_cache()
        for name in self.__get_statistics():
            function = eval("self.language_plugin.statistic_%s" % (name))
            self.reference_statistics[name] = function(
                self, self.reference_sequence)

    def __get_statistics(self) -> [str]:
        """
        Lists all statistics supported by a given language plugin.
        This should only be used internally, read the property "supported_statistics" instead.
        """
        names = [name for name in dir(
            self.language_plugin) if name.startswith('statistic')]
        return [name.replace('statistic_', '') for name in names]

    def set_statistic(self, name: str) -> None:
        """
        Enable a statistic based on its name.
        """
        if name not in self.supported_statistics:
            raise ValueError(f"Statistic {name} is not supported.")
        self.statistics[name] = None

    def set_statistics(self, names: [str]) -> None:
        """
        Enables statistics based on their names.
        """
        for name in names:
            if name not in self.supported_statistics:
                self.statistics = {}
                raise ValueError(f"Statistic {name} is not supported.")
            self.statistics[name] = None

    def set_all_statistics(self) -> None:
        """
        Enable all statistics supported by the current active language plugin.
        Enabling all statistics increases word generation computation time, especially for statistics such as ned1.
        """
        self.set_statistics(self.supported_statistics)

    def apply_statistics(self, sequence: str = None) -> None:
        """
        Apply all statistics which were set beforehand.
        """
        if sequence is None:
            sequence = self.current_sequence
        for name in self.statistics:
            function = eval("self.language_plugin.statistic_%s" % (name))
            if (sequence, name) in self.stat_cache:
                self.statistics[name] = self.stat_cache[(sequence, name)]
            else:
                self.statistics[name] = function(self, sequence)
                self.stat_cache[(sequence, name)] = self.statistics[name]
            if 'match' in function.__dict__:
                self.match_statistics[name] = function.match(
                    self.statistics[name], self.reference_statistics[name])
            if 'difference' in function.__dict__:
                self.difference_statistics[name] = function.difference(
                    self.statistics[name], self.reference_statistics[name])

    def clear_statistics(self) -> None:
        """
        Clear all the statistics set previously.
        """
        self.statistics = {}

    def __clear_stat_cache(self) -> None:
        """
        Clears the statistics cache. Only used by Wuggy internally.
        """
        self.stat_cache = {}

    def __clear_sequence_cache(self) -> None:
        """
        Clears the sequence cache. Only used by Wuggy internally.
        """
        self.sequence_cache = []

    def list_output_modes(self) -> [str]:
        """
        List output modes of the currently activated language plugin.
        """
        names = [name for name in dir(
            self.language_plugin) if name.startswith('output')]
        return [name.replace('output_', '') for name in names]

    def set_output_mode(self, name: str) -> None:
        """
        Set an output mode supported by the currently activated language plugin.
        """
        if name not in self.list_output_modes():
            raise ValueError(f"Output mode {name} is not supported.")
        self.output_mode = eval("self.language_plugin.output_%s" % (name))

    def set_attribute_filter(self, name: str) -> None:
        """
        Set an attribute filter supported by the currently activated language plugin.
        """
        reference_sequence = self.reference_sequence
        if name not in self.supported_attribute_filters:
            raise ValueError(
                f"Attribute filter {name} is not supported.")
        self.attribute_filters[name] = reference_sequence
        self.attribute_subchain = None

    def set_attribute_filters(self, names: [str]) -> None:
        """
        Set attribute filters supported by the currently activated language plugin.
        """
        for name in names:
            self.set_attribute_filter(name)

    def __apply_attribute_filters(self) -> None:
        """
        Apply all set attribute filters.
        This is currently used by Wuggy internally, do not call on your own.
        """
        for attribute, reference_sequence in self.attribute_filters.items():
            subchain = self.attribute_subchain if self.attribute_subchain is not None else self.bigramchain
            self.attribute_subchain = subchain.attribute_filter(
                reference_sequence, attribute)

    def clear_attribute_filters(self) -> None:
        """
        Remove all set attribute filters.
        """
        self.attribute_filters = {}

    def set_frequency_filter(self, lower: int, upper: int) -> None:
        """
        Sets the frequency filter for concentric search.
        Stricter search (small values for lower and upper) result in faster word generation.
        """
        self.frequency_filter = (self.reference_sequence, lower, upper)

    def clear_frequency_filter(self) -> None:
        """
        Clear the previously set frequency filter.
        """
        self.frequency_filter = None
        self.frequency_subchain = None

    def apply_frequency_filter(self) -> None:
        """
        Apply the previously set frequency filter.
        """
        if self.frequency_filter is None:
            raise Exception("No frequency filter was set")
        reference_sequence, lower, upper = self.frequency_filter
        subchain = self.attribute_subchain if self.attribute_subchain is not None else self.bigramchain
        self.frequency_subchain = subchain.frequency_filter(
            reference_sequence, lower, upper)

    @_loaded_language_plugin_required
    def generate_classic(
            self, input_sequences: [str],
            ncandidates_per_sequence: int = 10, max_search_time_per_sequence: int = 10,
            subsyllabic_segment_overlap_ratio: Union[Fraction, None] = Fraction(2, 3),
            match_subsyllabic_segment_length: bool = True, match_letter_length: bool = True,
            output_mode: str = "plain", concentric_search: bool = True) -> [Dict]:
        """
        This is the classic method to generate pseudowords using Wuggy and can be called immediately after loading a language plugin.
        The defaults for this method are similar to those set in the legacy version of Wuggy, resulting in sensible pseudowords.
        This method returns a list of pseudoword matches, including all match and difference statistics (lexicality, ned1, old2, plain_length, deviation statistics...).
        Beware that this method always clears the sequence cache and all previously set filters.
        Parameters:
            input_sequences: these are the input sequences (words) for which you want to generate pseudowords.

            ncandidates_per_sequence: this is the n (maximum) amount of pseudowords you want to generate per input sequence.

            max_search_time_per_sequence: this is the maximum time in seconds to search for pseudowords per input sequence.

            subsyllabic_segment_overlap_ratio: this is the Fraction ratio for overlap between subsyllabic segments. The default ensures your pseudowords are very word-like but not easily identifiable as related to an existing word. If set to None, this constraint is not applied.

            match_subsyllabic_segment_length: determines whether the generated pseudowords must retain the same subsyllabic segment length as the respective input sequence.

            match_letter_length: determines whether the generated pseudowords must retain the same word length as the respective input sequence. This option is redundant if match_subsyllabic_segment_length is set to True.

            output_mode: output mode for pseudowords, constricted by the output modes supported by the currently loaded language plugin.

            concentric_search: enable/disable concentric search. Wuggy operates best and fastest when concentric search is enabled. First, the algorithm will try to generate candidates that exactly match the transition frequencies of the reference word. Then the maximal allowed deviation in transition frequencies will increase by powers of 2 (i.e., +/-2, +/-4, +/-8, etc.).
        .. include:: ../../documentation/wuggygenerator/generate_classic.md
        """
        pseudoword_matches = []
        for input_sequence in input_sequences:
            pseudoword_matches.extend(
                self.__generate_classic_inner(
                    input_sequence,
                    ncandidates_per_sequence,
                    max_search_time_per_sequence,
                    subsyllabic_segment_overlap_ratio,
                    match_subsyllabic_segment_length,
                    match_letter_length, output_mode, concentric_search))
        return pseudoword_matches

    def __generate_classic_inner(
            self, input_sequence: str, ncandidates_per_sequence: int, max_search_time: int,
            subsyllabic_segment_overlap_ratio: Union[Fraction, None],
            match_subsyllabic_segment_length: bool, match_letter_length: bool, output_mode: str,
            concentric_search: bool = True):
        """
        Inner method for generate_classic(), which outputs a list of pseudoword matches for an input sequence.
        Should only be used by WuggyGenerator internally.
        """
        self.__clear_sequence_cache()
        self.clear_attribute_filters()
        self.clear_frequency_filter()
        input_sequence_segments = self.lookup_reference_segments(input_sequence)
        if input_sequence_segments is None:
            raise Exception(
                f"Sequence {input_sequence} was not found in lexicon {self.current_language_plugin_name}")
        self.set_reference_sequence(input_sequence_segments)
        self.set_output_mode(output_mode)
        subchain = self.bigramchain
        starttime = time()
        pseudoword_matches = []
        frequency_exponent = 1
        if match_subsyllabic_segment_length:
            self.set_attribute_filter("segment_length")
            self.__apply_attribute_filters()
            subchain = self.attribute_subchain
        while True:
            if concentric_search:
                self.set_frequency_filter(
                    2**frequency_exponent, 2**frequency_exponent)
                frequency_exponent += 1
                self.apply_frequency_filter()
                subchain = self.frequency_subchain
            subchain = subchain.clean(len(self.reference_sequence) - 1)
            subchain.set_startkeys(self.reference_sequence)
            for sequence in subchain.generate():
                # Mandatory statistics before finding a suitable match
                self.clear_statistics()
                self.set_statistics(["overlap_ratio", "plain_length", "lexicality"])
                if (time() - starttime) >= max_search_time:
                    return pseudoword_matches
                if self.language_plugin.output_plain(sequence) in self.sequence_cache:
                    continue
                self.current_sequence = sequence
                self.apply_statistics()
                if (not match_subsyllabic_segment_length and match_letter_length and self.difference_statistics["plain_length"] != 0):
                    continue
                if (subsyllabic_segment_overlap_ratio is not None and self.statistics["overlap_ratio"] !=
                        subsyllabic_segment_overlap_ratio):
                    continue
                if self.statistics["lexicality"] == "W":
                    continue
                # (Re)apply all statistics only if match is found: else search becomes unnecessarily slow
                self.set_all_statistics()
                self.apply_statistics()
                self.sequence_cache.append(
                    self.language_plugin.output_plain(sequence))
                match = {"word": input_sequence,
                         "segments": input_sequence_segments,
                         "pseudoword": self.output_mode(sequence)}
                match.update({"statistics": self.statistics,
                              "difference_statistics": self.difference_statistics})

                pseudoword_matches.append(copy.deepcopy(match))
                if len(pseudoword_matches) >= ncandidates_per_sequence:
                    return pseudoword_matches

    @_loaded_language_plugin_required_generator
    def generate_advanced(self, clear_cache: bool = True) -> Union[Generator[str, None, None],
                                                                   Generator[tuple, None, None]]:
        """
        Creates a custom generator which can be iterated to return generated pseudowords.
        The generator's settings, such as output statistics, should be set by you before calling this method.
        If attributes such as \"output_mode\" are not set, sensible defaults are used.
        Note that this method is for advanced users and may result in unexpected results if handled incorrectly.
        .. include:: ../../documentation/wuggygenerator/generate_advanced.md
        """
        if clear_cache:
            self.__clear_sequence_cache()
        if self.output_mode is None:
            self.set_output_mode("plain")
        if len(self.attribute_filters) == 0 and self.frequency_subchain is None:
            subchain = self.bigramchain
        if len(self.attribute_filters) != 0:
            if self.attribute_subchain is None:
                self.__apply_attribute_filters()
            subchain = self.attribute_subchain
        if self.frequency_filter is not None:
            self.apply_frequency_filter()
            subchain = self.frequency_subchain
        if self.reference_sequence is not None:
            subchain = subchain.clean(len(self.reference_sequence) - 1)
            subchain.set_startkeys(self.reference_sequence)
        else:
            warn(
                "No reference sequence was set. Ignore this message if this was intentional.")
            subchain.set_startkeys()
        for sequence in subchain.generate():
            if self.language_plugin.output_plain(sequence) in self.sequence_cache:
                pass
            else:
                self.sequence_cache.append(
                    self.language_plugin.output_plain(sequence))
                self.current_sequence = sequence
                self.apply_statistics()
                yield self.output_mode(sequence)

    def export_classic_pseudoword_matches_to_csv(
            self, pseudoword_matches: [Dict],
            csv_path: str) -> None:
        """
        Helper function to export generated pseudoword matches from generate_classic to CSV.
        The dictionairies from the matches are flattened before exporting to CSV.
        Parameters:
            pseudoword_matches: a dictionary of pseudoword matches retrieved from generate_classic

            csv_path: relative path to save csv file to (including the filename, e.g. ./pseudowords.csv)
        """
        def get_csv_headers(dictionary: dict):
            headers = []

            def flatten_nested_dict_keys(dictionary: dict, parent_dict_key=None):
                for key, value in dictionary.items():
                    key = str(key)
                    if isinstance(value, dict):
                        flatten_nested_dict_keys(
                            value, (parent_dict_key + "_" + key if parent_dict_key else key))
                    else:
                        if parent_dict_key:
                            headers.append((parent_dict_key + "_" + key))
                        else:
                            headers.append(key)
                return headers
            flatten_nested_dict_keys(dictionary)
            return headers

        def get_values_from_nested_dictionary(dictionary: dict):
            dict_vals = []

            def flatten_nested_dict_values(dictionary: dict):
                for value in dictionary.values():
                    if isinstance(value, dict):
                        flatten_nested_dict_values(value)
                    else:
                        dict_vals.append(value)
            flatten_nested_dict_values(dictionary)
            return dict_vals

        with open(csv_path, "w", newline='') as csvfile:
            file_writer = writer(csvfile)
            file_writer.writerow(get_csv_headers(pseudoword_matches[0]))
            for match in pseudoword_matches:
                file_writer.writerow(get_values_from_nested_dictionary(match))

Static methods

def remove_downloaded_language_plugins() ‑> None

Removes all downloaded (official) language plugins. Useful to cleanup after an experiment or to remove corrupt language plugins.

Expand source code

@staticmethod
def remove_downloaded_language_plugins() -> None:
    """
    Removes all downloaded (official) language plugins.
    Useful to cleanup after an experiment or to remove corrupt language plugins.
    """
    try:
        rmtree(os.path.join(Path(__file__).parents[1], "plugins", "language_data"))
    except FileNotFoundError as err:
        raise FileNotFoundError(
            "The official language plugin folder is already removed.") from err

Methods

def apply_frequency_filter(self) ‑> None

Apply the previously set frequency filter.

Expand source code

def apply_frequency_filter(self) -> None:
    """
    Apply the previously set frequency filter.
    """
    if self.frequency_filter is None:
        raise Exception("No frequency filter was set")
    reference_sequence, lower, upper = self.frequency_filter
    subchain = self.attribute_subchain if self.attribute_subchain is not None else self.bigramchain
    self.frequency_subchain = subchain.frequency_filter(
        reference_sequence, lower, upper)

def apply_statistics(self, sequence: str = None) ‑> None

Apply all statistics which were set beforehand.

Expand source code

def apply_statistics(self, sequence: str = None) -> None:
    """
    Apply all statistics which were set beforehand.
    """
    if sequence is None:
        sequence = self.current_sequence
    for name in self.statistics:
        function = eval("self.language_plugin.statistic_%s" % (name))
        if (sequence, name) in self.stat_cache:
            self.statistics[name] = self.stat_cache[(sequence, name)]
        else:
            self.statistics[name] = function(self, sequence)
            self.stat_cache[(sequence, name)] = self.statistics[name]
        if 'match' in function.__dict__:
            self.match_statistics[name] = function.match(
                self.statistics[name], self.reference_statistics[name])
        if 'difference' in function.__dict__:
            self.difference_statistics[name] = function.difference(
                self.statistics[name], self.reference_statistics[name])

def clear_attribute_filters(self) ‑> None

Remove all set attribute filters.

Expand source code

def clear_attribute_filters(self) -> None:
    """
    Remove all set attribute filters.
    """
    self.attribute_filters = {}

def clear_frequency_filter(self) ‑> None

Clear the previously set frequency filter.

Expand source code

def clear_frequency_filter(self) -> None:
    """
    Clear the previously set frequency filter.
    """
    self.frequency_filter = None
    self.frequency_subchain = None

def clear_statistics(self) ‑> None

Clear all the statistics set previously.

Expand source code

def clear_statistics(self) -> None:
    """
    Clear all the statistics set previously.
    """
    self.statistics = {}

def download_language_plugin(self, language_plugin_name: str, auto_download=False) ‑> None

Downloads and saves given language plugin to local storage from the corresponding official file repository. This method is called when you load in a language plugin automatically and you are missing the plugin locally. If you need to ensure your Wuggy script works on any machine without user confirmation, execute this method with the auto_download flag set to True before using the load method.

Parameters

language_plugin_name: this is the name for the official language plugin you want to download. If the language name is not officially supported, the method will throw an error.

auto_download: determines whether Wuggy provides the user with a prompt to confirm downloading a language plugin.

Expand source code

def download_language_plugin(
        self, language_plugin_name: str, auto_download=False) -> None:
    """
    Downloads and saves given language plugin to local storage from the corresponding official file repository.
    This method is called when you load in a language plugin automatically and you are missing the plugin locally.
    If you need to ensure your Wuggy script works on any machine without user confirmation, execute this method with the auto_download flag set to True before using the load method.
    Parameters:
        language_plugin_name: this is the name for the official language plugin you want to download. If the language name is not officially supported, the method will throw an error.

        auto_download: determines whether Wuggy provides the user with a prompt to confirm downloading a language plugin.
    """
    if language_plugin_name not in self.supported_official_language_plugin_names:
        raise ValueError("This language is not officially supported by Wuggy at this moment.")

    if not auto_download:
        while True:
            stdout.write(
                f"The language plugin {language_plugin_name} was not found in local storage. Do you allow Wuggy to download this plugin? [y/n]\n")
            choice = input().lower()
            if (not (choice.startswith("y") or choice.startswith("n"))):
                stdout.write("Please respond with 'y' or 'n'")
            elif choice.startswith("n"):
                raise ValueError(
                    "User declined permission for Wuggy to download necessary language plugin.")
            else:
                break

    language_plugins_folder_dirname = os.path.join(
        Path(__file__).parents[1], "plugins", "language_data")
    if not os.path.exists(language_plugins_folder_dirname):
        os.makedirs(language_plugins_folder_dirname)

    self.language_plugin_data_path = os.path.join(
        language_plugins_folder_dirname, language_plugin_name)
    if not os.path.exists(self.language_plugin_data_path):
        os.makedirs(self.language_plugin_data_path)

    print(
        f"Wuggy is currently downloading the plugin {language_plugin_name} for you from the official repository...")
    
    py_file_name = f"{language_plugin_name}.py"
    print(f"{self.__official_language_plugin_repository_url}/{language_plugin_name}/{py_file_name}")
    py_file = urlopen(
        f"{self.__official_language_plugin_repository_url}/{language_plugin_name}/{py_file_name}")

    file = open(f'{self.language_plugin_data_path}/{py_file_name}',
                'w', encoding="utf-8")
    # The current setup assumes that every official Wuggy language plugin use a single data file
    for line in py_file:
        file.write(line.decode("utf-8"))
    data_file_name = f"{language_plugin_name}.txt"
    data_file = urlopen(
        f"{self.__official_language_plugin_repository_url}/{language_plugin_name}/{data_file_name}")
    file = open(f'{self.language_plugin_data_path}/{data_file_name}',
                'w', encoding="utf-8")

    for line in data_file:
        file.write(line.decode("utf-8"))

    print(f"Wuggy has finished downloading {language_plugin_name}.")

def export_classic_pseudoword_matches_to_csv(self, pseudoword_matches: [typing.Dict], csv_path: str) ‑> None

Helper function to export generated pseudoword matches from generate_classic to CSV. The dictionairies from the matches are flattened before exporting to CSV.

Parameters

pseudoword_matches: a dictionary of pseudoword matches retrieved from generate_classic

csv_path: relative path to save csv file to (including the filename, e.g. ./pseudowords.csv)

Expand source code

def export_classic_pseudoword_matches_to_csv(
        self, pseudoword_matches: [Dict],
        csv_path: str) -> None:
    """
    Helper function to export generated pseudoword matches from generate_classic to CSV.
    The dictionairies from the matches are flattened before exporting to CSV.
    Parameters:
        pseudoword_matches: a dictionary of pseudoword matches retrieved from generate_classic

        csv_path: relative path to save csv file to (including the filename, e.g. ./pseudowords.csv)
    """
    def get_csv_headers(dictionary: dict):
        headers = []

        def flatten_nested_dict_keys(dictionary: dict, parent_dict_key=None):
            for key, value in dictionary.items():
                key = str(key)
                if isinstance(value, dict):
                    flatten_nested_dict_keys(
                        value, (parent_dict_key + "_" + key if parent_dict_key else key))
                else:
                    if parent_dict_key:
                        headers.append((parent_dict_key + "_" + key))
                    else:
                        headers.append(key)
            return headers
        flatten_nested_dict_keys(dictionary)
        return headers

    def get_values_from_nested_dictionary(dictionary: dict):
        dict_vals = []

        def flatten_nested_dict_values(dictionary: dict):
            for value in dictionary.values():
                if isinstance(value, dict):
                    flatten_nested_dict_values(value)
                else:
                    dict_vals.append(value)
        flatten_nested_dict_values(dictionary)
        return dict_vals

    with open(csv_path, "w", newline='') as csvfile:
        file_writer = writer(csvfile)
        file_writer.writerow(get_csv_headers(pseudoword_matches[0]))
        for match in pseudoword_matches:
            file_writer.writerow(get_values_from_nested_dictionary(match))

def generate_advanced(self, clear_cache: bool = True) ‑> Union[Generator[str, None, None], Generator[tuple, None, None]]

Creates a custom generator which can be iterated to return generated pseudowords. The generator's settings, such as output statistics, should be set by you before calling this method. If attributes such as "output_mode" are not set, sensible defaults are used. Note that this method is for advanced users and may result in unexpected results if handled incorrectly.

Generate Advanced Examples

In what format are pseudowords returned?

Using the advanced generate method, Wuggy will return a generator which you can iterate over to generate pseudowords, e.g:

from wuggy import WuggyGenerator

g = WuggyGenerator()
g.load("orthographic_english")
g.set_reference_sequence("balloon")
for sequence in g.generate_advanced(clear_cache=False):
    print(sequence)

(note that this example returns rather useless pseudowords since there are no restrictions set)

Generating Pseudowords (with sensible settings)

Generating pseudowords using this method requires good knowledge of Wuggy in order to generate pseudowords which, for example, closely resemble the origin reference word.

The following example uses advanced generation to set a number of restrictions on generated pseudowords.

Each origin word will generate a maximum of 10 pseudowords
Each pseudoword must be a non-word
Each pseudoword must overlap 2/3 subsyllabic segments
The frequency filter is increased if not enough matches are found within a given band (this is concentric search)
Sensible attribute filters are enforced

from fractions import Fraction

from wuggy import WuggyGenerator

words = ["trumpet", "car"]
g = WuggyGenerator()
g.load("orthographic_english")
ncandidates = 10
for word in words:
    g.set_reference_sequence(g.lookup_reference_segments(word))
    g.set_attribute_filter('sequence_length')
    g.set_attribute_filter('segment_length')
    g.set_statistic('overlap_ratio')
    g.set_statistic('plain_length')
    g.set_statistic('transition_frequencies')
    g.set_statistic('lexicality')
    g.set_statistic('ned1')
    g.set_output_mode('syllabic')
    j = 0
    for i in range(1, 10, 1):
        g.set_frequency_filter(2**i, 2**i)
        for sequence in g.generate_advanced(clear_cache=False):
            match = False
            if (g.statistics['overlap_ratio'] == Fraction(2, 3) and
                    g.statistics['lexicality'] == "N"):
                match = True
            if match == True:
                print(sequence)
                j = j+1
                if j > ncandidates:
                    break
        if j > ncandidates:
            break

Note how using generate_advanced requires setting many parameters and candidate check logic yourself. Make sure that generate_classic does not suit your needs before using this method, as its low level nature makes it easy to return pseudowords which do not fit your needs.

Expand source code

@_loaded_language_plugin_required_generator
def generate_advanced(self, clear_cache: bool = True) -> Union[Generator[str, None, None],
                                                               Generator[tuple, None, None]]:
    """
    Creates a custom generator which can be iterated to return generated pseudowords.
    The generator's settings, such as output statistics, should be set by you before calling this method.
    If attributes such as \"output_mode\" are not set, sensible defaults are used.
    Note that this method is for advanced users and may result in unexpected results if handled incorrectly.
    .. include:: ../../documentation/wuggygenerator/generate_advanced.md
    """
    if clear_cache:
        self.__clear_sequence_cache()
    if self.output_mode is None:
        self.set_output_mode("plain")
    if len(self.attribute_filters) == 0 and self.frequency_subchain is None:
        subchain = self.bigramchain
    if len(self.attribute_filters) != 0:
        if self.attribute_subchain is None:
            self.__apply_attribute_filters()
        subchain = self.attribute_subchain
    if self.frequency_filter is not None:
        self.apply_frequency_filter()
        subchain = self.frequency_subchain
    if self.reference_sequence is not None:
        subchain = subchain.clean(len(self.reference_sequence) - 1)
        subchain.set_startkeys(self.reference_sequence)
    else:
        warn(
            "No reference sequence was set. Ignore this message if this was intentional.")
        subchain.set_startkeys()
    for sequence in subchain.generate():
        if self.language_plugin.output_plain(sequence) in self.sequence_cache:
            pass
        else:
            self.sequence_cache.append(
                self.language_plugin.output_plain(sequence))
            self.current_sequence = sequence
            self.apply_statistics()
            yield self.output_mode(sequence)

def generate_classic(self, input_sequences: [], ncandidates_per_sequence: int = 10, max_search_time_per_sequence: int = 10, subsyllabic_segment_overlap_ratio: Optional[fractions.Fraction] = Fraction(2, 3), match_subsyllabic_segment_length: bool = True, match_letter_length: bool = True, output_mode: str = 'plain', concentric_search: bool = True) ‑> [typing.Dict]

This is the classic method to generate pseudowords using Wuggy and can be called immediately after loading a language plugin. The defaults for this method are similar to those set in the legacy version of Wuggy, resulting in sensible pseudowords. This method returns a list of pseudoword matches, including all match and difference statistics (lexicality, ned1, old2, plain_length, deviation statistics…). Beware that this method always clears the sequence cache and all previously set filters.

Parameters

input_sequences: these are the input sequences (words) for which you want to generate pseudowords.

ncandidates_per_sequence: this is the n (maximum) amount of pseudowords you want to generate per input sequence.

max_search_time_per_sequence: this is the maximum time in seconds to search for pseudowords per input sequence.

subsyllabic_segment_overlap_ratio: this is the Fraction ratio for overlap between subsyllabic segments. The default ensures your pseudowords are very word-like but not easily identifiable as related to an existing word. If set to None, this constraint is not applied.

match_subsyllabic_segment_length: determines whether the generated pseudowords must retain the same subsyllabic segment length as the respective input sequence.

match_letter_length: determines whether the generated pseudowords must retain the same word length as the respective input sequence. This option is redundant if match_subsyllabic_segment_length is set to True.

output_mode: output mode for pseudowords, constricted by the output modes supported by the currently loaded language plugin.

concentric_search: enable/disable concentric search. Wuggy operates best and fastest when concentric search is enabled. First, the algorithm will try to generate candidates that exactly match the transition frequencies of the reference word. Then the maximal allowed deviation in transition frequencies will increase by powers of 2 (i.e., +/-2, +/-4, +/-8, etc.).

Generate Classic Examples

In what format are pseudowords returned?

Pseudowords are returned in a dictionary format in a verbose format, containing details such as statistics. Below is an example return value for a pseudoword generated for car.

{
    "word": "car",
    "segments": "car",
    "pseudoword": "cag",
    "statistics": {
        "lexicality": "N",
        "ned1": 24,
        "old20": 1.0,
        "overlap": 2,
        "overlap_ratio": Fraction(2, 3),
        "plain_length": 1,
        "transition_frequencies": {0: 304, 1: 92, 2: 22, 3: 80},
    },
    "difference_statistics": {
        "ned1": -4,
        "old20": 0.050000000000000044,
        "plain_length": 0,
        "transition_frequencies": {0: 0, 1: 0, 2: 8, 3: -11},
    },
}

Generating pseudowords (default settings)

In this example, we will generate pseudowords for the English words car and bicycle. We will print these pseudowords to the console.

from wuggy import WuggyGenerator

g = WuggyGenerator()
g.load("orthographic_english")
for match in g.generate_classic(["car", "bicycle"]):
    print(match["pseudoword"])

The code above first loads the orthographic_english language plugin. After this, the generate_classic method is called with a list of reference sequences for which we want to generate pseudowords. The method returns a list of pseudoword matches. These matches consist of dictionairies with many details about the match, such as relevant statistics. Since we are only interested in the generated pseudowords, we print the value assigned to the key pseudoword.

Generating pseudowords (custom settings)

In this example, we will generate pseudowords for the English words car and bicycle, this time using some custom settings. The generate_classic method takes several optional arguments which can be used to change the output of the generator. The defaults are usually great for generating useful pseudowords, so this example will only change two parameters.

from wuggy import WuggyGenerator

g = WuggyGenerator()
g.load("orthographic_english")
for match in g.generate_classic(
    ["car", "bicycle"],
    ncandidates_per_sequence=30, max_search_time_per_sequence=25):
    print(match["pseudoword"])

The code above will ensure that, per sequence in the input list, a maximum of 30 candidates will be generated. By default, Wuggy only has 10 seconds to find this amount of candidates per sequence. For this reason, we can set the max_search_time_per_sequence to a higher amount to ensure that 30 sequences can be generated in time.

Generating pseudowords and exporting to CSV

Since Wuggy is a Python library, its output can be easily used by other modules to perform actions such as exporting pseudowords to CSV. This can be done manually, although Wuggy includes a built-in helper method to easily export classic pseudoword matches to a CSV file:

from csv import DictWriter

from wuggy.generators.wuggygenerator import WuggyGenerator

g = WuggyGenerator()
g.load("orthographic_english")
pseudoword_matches = g.generate_classic(["car"])
g.export_classic_pseudoword_matches_to_csv(pseudoword_matches, "./pseudowords.csv")

By using this method, the nested dictionary will be flattened so that the resulting CSV can be easily interpreted by your software of choice.

Expand source code

@_loaded_language_plugin_required
def generate_classic(
        self, input_sequences: [str],
        ncandidates_per_sequence: int = 10, max_search_time_per_sequence: int = 10,
        subsyllabic_segment_overlap_ratio: Union[Fraction, None] = Fraction(2, 3),
        match_subsyllabic_segment_length: bool = True, match_letter_length: bool = True,
        output_mode: str = "plain", concentric_search: bool = True) -> [Dict]:
    """
    This is the classic method to generate pseudowords using Wuggy and can be called immediately after loading a language plugin.
    The defaults for this method are similar to those set in the legacy version of Wuggy, resulting in sensible pseudowords.
    This method returns a list of pseudoword matches, including all match and difference statistics (lexicality, ned1, old2, plain_length, deviation statistics...).
    Beware that this method always clears the sequence cache and all previously set filters.
    Parameters:
        input_sequences: these are the input sequences (words) for which you want to generate pseudowords.

        ncandidates_per_sequence: this is the n (maximum) amount of pseudowords you want to generate per input sequence.

        max_search_time_per_sequence: this is the maximum time in seconds to search for pseudowords per input sequence.

        subsyllabic_segment_overlap_ratio: this is the Fraction ratio for overlap between subsyllabic segments. The default ensures your pseudowords are very word-like but not easily identifiable as related to an existing word. If set to None, this constraint is not applied.

        match_subsyllabic_segment_length: determines whether the generated pseudowords must retain the same subsyllabic segment length as the respective input sequence.

        match_letter_length: determines whether the generated pseudowords must retain the same word length as the respective input sequence. This option is redundant if match_subsyllabic_segment_length is set to True.

        output_mode: output mode for pseudowords, constricted by the output modes supported by the currently loaded language plugin.

        concentric_search: enable/disable concentric search. Wuggy operates best and fastest when concentric search is enabled. First, the algorithm will try to generate candidates that exactly match the transition frequencies of the reference word. Then the maximal allowed deviation in transition frequencies will increase by powers of 2 (i.e., +/-2, +/-4, +/-8, etc.).
    .. include:: ../../documentation/wuggygenerator/generate_classic.md
    """
    pseudoword_matches = []
    for input_sequence in input_sequences:
        pseudoword_matches.extend(
            self.__generate_classic_inner(
                input_sequence,
                ncandidates_per_sequence,
                max_search_time_per_sequence,
                subsyllabic_segment_overlap_ratio,
                match_subsyllabic_segment_length,
                match_letter_length, output_mode, concentric_search))
    return pseudoword_matches

def list_output_modes(self) ‑> []

List output modes of the currently activated language plugin.

Expand source code

def list_output_modes(self) -> [str]:
    """
    List output modes of the currently activated language plugin.
    """
    names = [name for name in dir(
        self.language_plugin) if name.startswith('output')]
    return [name.replace('output_', '') for name in names]

def load(self, language_plugin_name: str, local_language_plugin: BaseLanguagePlugin = None) ‑> None

Loads in a language plugin, if available, and stores the corresponding bigramchains.

Parameters

language_plugin_name: must be the exact string of an official language plugin (see self.supported_official_language_plugin_names). If you are loading in a local plugin, the name can be anything as long as it does not conflict with an already loaded plugin name.

local_language_plugin: must be a child class of BaseLanguagePlugin: see BaseLanguagePlugin for more information on how to create a custom language plugin.

Expand source code

def load(self, language_plugin_name: str,
         local_language_plugin: BaseLanguagePlugin = None) -> None:
    """
    Loads in a language plugin, if available, and stores the corresponding bigramchains.
    Parameters:
        language_plugin_name: must be the exact string of an official language plugin (see self.supported_official_language_plugin_names). If you are loading in a local plugin, the name can be anything as long as it does not conflict with an already loaded plugin name.

        local_language_plugin: must be a child class of BaseLanguagePlugin: see BaseLanguagePlugin for more information on how to create a custom language plugin.
    """
    if local_language_plugin:
        # TODO: if someone does not pass a class INSTANCE, they get TypeError: <class 'type'> is a built-in class, this is a vague error and probably should be abstracted
        self.language_plugin_data_path = os.path.dirname(
            inspect.getfile(local_language_plugin.__class__))
        self.language_plugin_name = language_plugin_name
        language_plugin = local_language_plugin

    if local_language_plugin is None:
        if language_plugin_name not in self.supported_official_language_plugin_names:
            raise ValueError(
                "This language is not officially supported by Wuggy at this moment. If this is a local plugin, pass the local_language_plugin")
        self.language_plugin_name = language_plugin_name
        language_plugins_folder_dirname = os.path.join(
            Path(__file__).parents[1], "plugins", "language_data")
        self.language_plugin_data_path = os.path.join(
            language_plugins_folder_dirname, language_plugin_name)
        if not os.path.exists(self.language_plugin_data_path):
            self.download_language_plugin(
                language_plugin_name)
        # Official language plugins MUST have the class name "OfficialLanguagePlugin"!
        language_plugin = importlib.import_module(
            f".plugins.language_data.{language_plugin_name}.{language_plugin_name}",
            "wuggy").OfficialLanguagePlugin()

    if language_plugin_name not in self.bigramchains:
        default_data_path = os.path.join(
            self.language_plugin_data_path, language_plugin.default_data)

        data_file = codecs.open(default_data_path, 'r', encoding='utf-8')
        self.bigramchains[self.language_plugin_name] = BigramChain(
            language_plugin)
        self.bigramchains[self.language_plugin_name].load(
            data_file)
    self.__activate(self.language_plugin_name)

def lookup_reference_segments(self, reference: str) ‑> Optional[str]

Look up a given reference (word) from the currently active lookup lexicon. Returns the segments of the found word, if the word is not found it returns None. This should be used before setting a word as a reference sequence.

Expand source code

def lookup_reference_segments(self, reference: str) -> Optional[str]:
    """
    Look up a given reference (word) from the currently active lookup lexicon.
    Returns the segments of the found word, if the word is not found it returns None.
    This should be used before setting a word as a reference sequence.
    """
    return self.lookup_lexicon.get(reference, None)

def set_all_statistics(self) ‑> None

Enable all statistics supported by the current active language plugin. Enabling all statistics increases word generation computation time, especially for statistics such as ned1.

Expand source code

def set_all_statistics(self) -> None:
    """
    Enable all statistics supported by the current active language plugin.
    Enabling all statistics increases word generation computation time, especially for statistics such as ned1.
    """
    self.set_statistics(self.supported_statistics)

def set_attribute_filter(self, name: str) ‑> None

Set an attribute filter supported by the currently activated language plugin.

Expand source code

def set_attribute_filter(self, name: str) -> None:
    """
    Set an attribute filter supported by the currently activated language plugin.
    """
    reference_sequence = self.reference_sequence
    if name not in self.supported_attribute_filters:
        raise ValueError(
            f"Attribute filter {name} is not supported.")
    self.attribute_filters[name] = reference_sequence
    self.attribute_subchain = None

def set_attribute_filters(self, names: []) ‑> None

Set attribute filters supported by the currently activated language plugin.

Expand source code

def set_attribute_filters(self, names: [str]) -> None:
    """
    Set attribute filters supported by the currently activated language plugin.
    """
    for name in names:
        self.set_attribute_filter(name)

def set_frequency_filter(self, lower: int, upper: int) ‑> None

Sets the frequency filter for concentric search. Stricter search (small values for lower and upper) result in faster word generation.

Expand source code

def set_frequency_filter(self, lower: int, upper: int) -> None:
    """
    Sets the frequency filter for concentric search.
    Stricter search (small values for lower and upper) result in faster word generation.
    """
    self.frequency_filter = (self.reference_sequence, lower, upper)

def set_output_mode(self, name: str) ‑> None

Set an output mode supported by the currently activated language plugin.

Expand source code

def set_output_mode(self, name: str) -> None:
    """
    Set an output mode supported by the currently activated language plugin.
    """
    if name not in self.list_output_modes():
        raise ValueError(f"Output mode {name} is not supported.")
    self.output_mode = eval("self.language_plugin.output_%s" % (name))

def set_reference_sequence(self, sequence: str) ‑> None

Set the reference sequence. This is commonly used before generate methods in order to set the reference word for which pseudowords should be generated.

Expand source code

@_loaded_language_plugin_required
def set_reference_sequence(self, sequence: str) -> None:
    """
    Set the reference sequence.
    This is commonly used before generate methods in order to set the reference word for which pseudowords should be generated.
    """
    self.reference_sequence = self.language_plugin.transform(
        sequence).representation
    self.reference_sequence_frequencies = self.bigramchain.get_frequencies(
        self.reference_sequence)
    self.__clear_stat_cache()
    for name in self.__get_statistics():
        function = eval("self.language_plugin.statistic_%s" % (name))
        self.reference_statistics[name] = function(
            self, self.reference_sequence)

def set_statistic(self, name: str) ‑> None

Enable a statistic based on its name.

Expand source code

def set_statistic(self, name: str) -> None:
    """
    Enable a statistic based on its name.
    """
    if name not in self.supported_statistics:
        raise ValueError(f"Statistic {name} is not supported.")
    self.statistics[name] = None

def set_statistics(self, names: []) ‑> None

Enables statistics based on their names.

Expand source code

def set_statistics(self, names: [str]) -> None:
    """
    Enables statistics based on their names.
    """
    for name in names:
        if name not in self.supported_statistics:
            self.statistics = {}
            raise ValueError(f"Statistic {name} is not supported.")
        self.statistics[name] = None