Source code for conlang.utils

from typing import List, Dict
from .phonemes import PHONEME_SET, COMMON_PHONEME_SET



[docs]
def parse_phonemes(word: str) -> List[str]:
    """
    Splits a word string into phonemes using the predefined PHONEME_SET list.

    Args:
        word (str): The word to split into phonemes as a string.

    Returns:
        List[str]: A list of phonemes.
    """
    phonemes = []
    i = 0
    while i < len(word):
        # Phonemes can be up to 3 characters long
        for j in [3, 2, 1]:
            if word[i:i + j] in PHONEME_SET:
                phonemes.append(word[i:i + j])
                i += j
                break
        else:
            phonemes.append(word[i])
            i += 1
    return phonemes




[docs]
def process_phonemes(phonemes: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """
    Processes phoneme sets to make common phonemes more likely to be chosen.

    Args:
        phonemes (Dict[str, List[str]]): A dictionary of phoneme categories and their respective
                                         phonemes.

    Returns:
        Dict[str, List[str]]: A modified dictionary where common phonemes are more likely.
    """
    processed = {
        category: [
            phoneme for phoneme in phoneme_list
            for _ in range(2 if all(item in COMMON_PHONEME_SET for item in parse_phonemes(phoneme)) else 1)
        ]
        for category, phoneme_list in phonemes.items()
    }
    return processed




[docs]
def process_patterns(patterns: List[str], phonemes: Dict[str, List[str]]) -> List[str]:
    """
    Processes patterns to prioritize simpler ones (short and without clusters).

    Args:
        patterns (List[str]): A list of patterns to process.
        phonemes (Dict[str, List[str]]): A dictionary of phoneme categories and their respective
                                         phonemes.

    Returns:
        List[str]: A modified list of patterns where simpler patterns are more likely.
    """
    def has_cluster(pattern: str) -> bool:
        """
        Checks if a pattern contains phoneme clusters.

        Args:
            pattern (str): The pattern to check.

        Returns:
            bool: True if the pattern contains clusters, False otherwise.
        """
        for letter in pattern:
            if any(char not in PHONEME_SET for char in phonemes[letter]):
                return True
        return False

    # Prioritize simpler patterns (those without clusters)
    processed = [pattern for pattern in patterns for _ in range(
        2 if not has_cluster(pattern) else 1)]
    # Sort by length to prioritize shorter patterns
    processed = sorted(processed, key=len)
    return processed