User Tools

Site Tools


yivalkes:regex

REGEX SPECIFICATION

Complete rule set for direct implementation. All regexes are in PCRE / Python re format. Apply re.IGNORECASE everywhere.

Stem Functions

import re
 
def causer_stem(spoken: str) -> str:
    """Derive the Causer stem from a spoken form.

    Returns '' when the form contains any of ``! ? ( )``. Otherwise one
    leading hyphen is dropped and the text is lower-cased. If two or more
    hyphens remain, the span from the first ``-segment-`` through the end
    is replaced by that segment alone; otherwise everything from the first
    comma or hyphen onward is removed.
    """
    if re.search(r'[!?()]', spoken) is not None:
        return ''

    trimmed = spoken[1:] if spoken.startswith('-') else spoken
    lowered = trimmed.lower()

    if lowered.count('-') < 2:
        return re.sub(r'[,-].*$', '', lowered)
    return re.sub(r'-([^-]+)-.*$', r'\1', lowered)
 
def median_stem(spoken: str) -> str:
    """Derive the Median stem from a spoken form.

    Returns '' when the form contains any of ``! ? ( )``. Otherwise one
    leading hyphen is dropped and the text is lower-cased, then:

    * with at least two hyphens, the text before the second-to-last hyphen
      is joined to the text after the last hyphen;
    * with one hyphen, the text before it is joined to the run of
      non-space characters that follows it;
    * with no hyphen, everything from the first comma onward is removed.
    """
    if re.search(r'[!?()]', spoken) is not None:
        return ''

    text = (spoken[1:] if spoken.startswith('-') else spoken).lower()

    two_hyphens = re.match(r'(.*)-(.*)-(.*)', text)
    if two_hyphens is not None:
        head, _middle, tail = two_hyphens.groups()
        return head + tail

    one_hyphen = re.match(r'(.*)-([^ ]*)(.*)', text)
    if one_hyphen is not None:
        return ''.join(one_hyphen.group(1, 2))

    return re.sub(r',.*$', '', text)

Reduplicated: dedicated onset-echo table

The Reduplicated column uses its own 16-pattern table, which is completely independent of the Actor/Passor PATTERNS list. Apply it to the Causer stem; the first matching pattern wins.

# Ordered (pattern, replacement) rules for the Reduplicated form.
# apply_reduplicated walks this list top to bottom and applies only the
# first pattern that matches, so row order is significant. Every pattern
# is anchored at the start of the string with ^.
#
# NOTE(review): the second-to-last row, ^(h?)([ea]*), can match the empty
# string at position 0 and therefore matches every input; the final
# catch-all row is never reached. Confirm whether that row was meant to
# require at least one character.
REDUP_PAIRS = [
    (r'^[td]h?([sz]h?)([aeiouy]*)([aeiou])',      r't\3d\1\2\3'),
    (r'^([bdgptkszfv])(h?)([lrsfzv])',             r'\1e\1\2\3'),
    (r'^([sz])(h?)([pbkgtd])',                     r's\2ez\3'),
    (r'^[fv]([pbkgtd])',                           r'fev\1'),
    (r'^[sz]([aeiou]*)([aeiou])',                  r's\2z\1\2'),
    (r'^[fv]([aeiou]*)([aeiou])',                  r'f\2v\1\2'),
    (r'^[sz]h([aeiou]*)([aeiou])',                 r'sh\2zh\1\2'),
    (r'^(h?)([uwo]*)([ou])',                       r'\1owo'),
    (r'^(h?)([iy]*)([aeoiu])',                     r'\1iya'),
    (r'^[pb]h?([aeiou]*)([aeiou])',                r'p\2b\1\2'),
    (r'^([nml])([aeiou]*)([aeiou])',               r'\1\3\1\2\3'),
    (r'^[kg]h?([aeiou]*)([aeiou])',                r'k\2g\1\2'),
    (r'^[fv]([nml])',                              r'fav\1'),
    (r'^[sz]([nml])',                              r'saz\1'),
    (r'^(h?)([ea]*)',                              r'\1ea'),
    (r'^',                                         r'hee'),   # catch-all
]
 
def apply_reduplicated(causer: str, pairs=None, flags=re.IGNORECASE) -> str:
    """Apply the first matching reduplication rule to a Causer stem.

    Args:
        causer: The Causer stem to transform.
        pairs: Ordered iterable of ``(pattern, replacement)`` rules. When
            omitted, the module-level ``REDUP_PAIRS`` table is used.
        flags: ``re`` flags used for both the match test and the
            substitution. Defaults to ``re.IGNORECASE``.

    Returns:
        The stem rewritten by the first rule whose pattern matches, or the
        stem unchanged when no rule matches.
    """
    if pairs is None:
        pairs = REDUP_PAIRS
    for pat, repl in pairs:
        if re.search(pat, causer, flags):
            # flags must be passed by keyword: the fourth positional
            # parameter of re.sub is `count`, so a positional
            # re.IGNORECASE would be read as count=2 and the substitution
            # would run case-sensitively.
            return re.sub(pat, repl, causer, flags=flags)
    return causer

18 Patterns (Python regex strings)

# The 18 Actor/Passor regex strings, indexed [0]..[17]; the trailing
# letter in each comment (B..S) is the rule's label. The replacement
# lists (REPL_*) are indexed by the same positions, and the apply_*
# functions try the indices in the order they are given and use the first
# pattern that matches. All patterns except parts of [14] are anchored
# only at the end of the string with $.
#
# NOTE(review): the class [''\-] in [14]-[16] lists the ASCII apostrophe
# twice; possibly one of them was a typographic apostrophe before the
# text was exported — confirm against the original source.
PATTERNS = [
    r'et(t?)$',                                           # [0] B
    r'([ou])y$',                                          # [1] C
    r"(y[ou]+)([sf]h?)?$",                                # [2] D
    r'([w]+)[ae]+$',                                      # [3] E
    r'([yi]+)([aeiou]+)([bdgptkmnlrfscvzjh]+)$',         # [4] F
    r'([wuo]+)([aeiou]*)([bdgptkmnrfscvzjh]+)$',         # [5] G
    r'([bdgptkmnlrfscvzjh]+)(([aeiou])([aeiou]))([lr])$', # [6] H
    r'([bdgptkmnlrfscvzjh]+)([aeiou])([lr])$',           # [7] I
    r"([bdgptkmnlrfscvzjh']+)([ea])([ea]*)$",            # [8] J
    r'([bdgptkmnlrfscvzjh]+)([aeiou]*)([ouw]+)$',        # [9] K
    r'([bdgptkmnlrfscvzjhw]+)([aeiou]*)([iy])$',         # [10] L
    r'([bdgptkmnlrfscvzjh]+)([aeiou]+)([bdgptkmnrfscvzjh]+)$',  # [11] M
    r'()([ae])y()$',                                      # [12] N
    r'(y)([ae]+)()$',                                     # [13] O
    r"(^|[''\-])([aeoiu]+)([bpkgtd]+)$",                # [14] P
    r"([''\-])([aeoiu]+)([lr])$",                        # [15] Q
    r"([''\-])([aeoiu]+)([lr][lr])$",                    # [16] R
    r'$',                                                 # [17] S catch-all
]
 
# Index selections into PATTERNS / the replacement lists, in trial order.
SKIP_R_INDICES = [i for i in range(18) if i != 16]   # all except [16]
ALL_INDICES    = [*range(18)]                        # Actor-There only

Replacement Strings per Case

# Replacement templates, one list per case. Each list has 18 entries;
# entry i is the re.sub replacement for PATTERNS[i], so \N refers to the
# capture groups of that pattern. The parenthesised name after each case
# says which index selection the case is applied with. Every list still
# carries an entry at position 16, so positions stay aligned with
# PATTERNS even for cases whose index selection leaves 16 out.

# Actor-There (uses ALL_INDICES)
REPL_ACTOR_THERE  = [
    r'et\1a', r'\1ya', r"\1'a\2", r'wawa', r'\1\2\3e', r'\1\2\3e',
    r'\1\2ra', r'\1\2ra', r'\1\2wa', r'\1\2wa', r'\1\2\3ya', r'\1\2\3e',
    r'\2ye', r"\1\2\3'a", r'\1\2\3a', r'\1\2ra', r'\1\2\3a', r'a'
]
# Actor-Hither (SKIP_R_INDICES)
# NOTE(review): entry [2] here is r"\1'i" with no \2, whereas entry [2]
# of every other list in this section ends in \2 — confirm the omission
# is intended.
REPL_ACTOR_HITHER = [
    r'et\1si', r'\1yi', r"\1'i", r'\1ey', r'\1\2\3i', r'\1\2\3i',
    r'\1\2li', r'\1\2li', r'\1\2yi', r'\1\2wii', r'\1\2\3yi', r'\1\2\3i',
    r'\2yi', r"\1\2\3'i", r'\1\2\3i', r'\1\2ri', r'\1\2\3i', r'i'
]
# Actor-Hence (SKIP_R_INDICES)
REPL_ACTOR_HENCE  = [
    r'et\1soy', r'\1iyo', r'\1yo\2', r'\1oy', r'\1\2\3oy', r'\1\2\3oy',
    r'\1\2loy', r'\1\2loy', r'\1\2yo', r'\1\2\3yo', r'\1\2\3yo', r'\1\2\3oy',
    r'\2iyo', r"\1\2\3'oy", r'\1\2\3oy', r'\1\2roy', r'\1\2\3oy', r'oy'
]
# Passor-Here (SKIP_R_INDICES)
REPL_PASSOR_HERE  = [
    r'et\1', r'uy', r'yu\2', r'wee', r'ii\3', r'u\3',
    r'\1\4\5', r'\1i\3', r'\1ee', r'\1u', r'\1\2i', r'\1i\3',
    r'ey', r'yee', r'\1e\3', r'\1el', r'\1el', r''
]
# Passor-There (SKIP_R_INDICES)
REPL_PASSOR_THERE = [
    r'ayt', r'aw', r'yaw\2', r'wea', r'ya\3', r'waw\3',
    r'\1\4ra', r'\1ea\3', r'\1ewa', r'\1ua', r'\1\2ay', r'\1ea\3',
    r'ay', r'yaw', r'\1ay\3', r'\1ear', r'\1ear', r'e'
]
# Passor-Hither (SKIP_R_INDICES)
REPL_PASSOR_HITHER = [
    r'iss', r'uiii', r'iyu\2', r'wi', r'yi\3i', r'wii\3',
    r'\1\4lii', r'\1elii', r'\1ey', r'\1uwi', r'\1eye', r'\1i\3i',
    r'eyi', r'yei', r'\1i\3i', r'\1iri', r'\1iri', r'i'
]
# Passor-Hence (SKIP_R_INDICES)
REPL_PASSOR_HENCE = [
    r'oss', r'uyu', r'uyu\2', r'wu', r'ya\3u', r'wi\3o',
    r'\1\5aw', r'\1\3aw', r'\1oy', r'\1oyo', r'\1yu', r'\1o\3u',
    r'eyu', r'yu', r'\1o\3u', r'\1oru', r'\1oru', r'u'
]

Apply Functions

def apply_with_ai_at_7(q, ai, patterns, replacements, indices, flags=re.IGNORECASE):
    """Apply the first rule that matches ``q``, substituting into ``ai`` for rule 7.

    The indices are tried in the given order and empty patterns are
    skipped. The match test is always made against ``q``. When the winning
    index is 7 the substitution is performed on ``ai`` instead of ``q``
    (so ``ai`` is returned unchanged if the pattern does not occur in it).
    Returns ``q`` unchanged when no pattern matches.
    """
    winner = next(
        (i for i in indices if patterns[i] and re.search(patterns[i], q, flags)),
        None,
    )
    if winner is None:
        return q

    subject = q
    if winner == 7:
        subject = ai
    return re.sub(patterns[winner], replacements[winner], subject, flags=flags)
 
def apply_passor_hence(q, ai, patterns, replacements, indices, flags=re.IGNORECASE):
    """Apply the first matching rule, testing rule 7 against ``ai``.

    The indices are tried in the given order and empty patterns are
    skipped. Index 7 is both tested against and substituted into ``ai``;
    every other index is tested against and substituted into ``q``. If
    rule 7 does not match ``ai`` the search continues with the next index.
    Returns ``q`` unchanged when nothing matches.
    """
    for i in indices:
        rule = patterns[i]
        if not rule:
            continue
        subject = ai if i == 7 else q
        if re.search(rule, subject, flags) is None:
            continue
        return re.sub(rule, replacements[i], subject, flags=flags)
    return q
 
def apply_case_ai(ai, patterns, replacements, indices, flags=re.IGNORECASE):
    """Apply the first matching rule to ``ai``.

    The indices are tried in the given order, skipping empty patterns.
    The first pattern found in ``ai`` is substituted with the replacement
    at the same index. Returns ``ai`` unchanged when nothing matches.
    """
    usable = (i for i in indices if patterns[i])
    for i in usable:
        rule = patterns[i]
        if re.search(rule, ai, flags) is None:
            continue
        return re.sub(rule, replacements[i], ai, flags=flags)
    return ai
 
def apply_person(form, pairs, flags=re.IGNORECASE):
    """Apply the first matching ``(pattern, replacement)`` pair to ``form``.

    Pairs are tried in order. For the first pattern found in ``form``,
    only its first occurrence is replaced; limiting the substitution to
    one occurrence keeps patterns that can also match the empty string at
    the end (such as ``e?$``) from being applied twice. Returns ``form``
    unchanged when no pair matches.
    """
    for pattern, replacement in pairs:
        rx = re.compile(pattern, flags)
        if rx.search(form) is not None:
            return rx.sub(replacement, form, count=1)
    return form

Person Suffix Pairs

# Person suffix rules as ordered (pattern, replacement) pairs, the shape
# that apply_person iterates over (first matching pair is used).
ME_PAIRS = [
    (r'([ao]|[ae]e)$', r'\1ni'),
    (r'ii$', 'iin'),
    (r'[uw]$', 'win'),
    (r'e?$', 'in'),
]
YOU_PAIRS = [
    (r'([^aeiouwyn])$', r'\1ets'),
    (r'([^aeiou]*[aeiou]+[^aeiou]+[aeiou]+[^aeiou]*[aeiou]+)$', r'\1ts'),
    (r'$', 'tse'),
]
THEM_PAIRS = [
    (r'([aeou])$', r'\1rh'),
    (r'([^i][wyi])$', r'\1irh'),
    (r'$', 'erh'),
]

Portmanteau Table

# Fused person+direction endings, keyed by (person, direction).
# Each value is a 3-tuple that apply_portmanteau unpacks as
# (pattern, substitution, fallback): if `pattern` is found in the
# here-person form it is replaced by `substitution`; otherwise `fallback`
# is appended to the form.
PORTMANTEAUS = {
    ('me',   'there'):  ('in$',       'inia',   'ye'),
    ('you',  'there'):  ('tse$',      'tsa',    'a'),
    ('them', 'there'):  ('e?[ei]rh$', 'earh',   'a'),
    ('me',   'hither'): ('in$',       'inneye', 'yi'),
    ('you',  'hither'): ('tse$',      'tsi',    'i'),
    ('them', 'hither'): ('[ei]rh$',   'eyerh',  'i'),
    ('me',   'hence'):  ('in$',       'inyo',   'yo'),
    ('you',  'hence'):  ('tse$',      'tsoy',   'oy'),
    ('them', 'hence'):  ('[ei]rh$',   'iyorh',  'yo'),
}
 
def apply_portmanteau(here_person_form, person, direction, flags=re.IGNORECASE):
    """Fuse a person-marked form with a direction ending.

    Looks up the ``(person, direction)`` entry in ``PORTMANTEAUS`` (a
    missing key raises ``KeyError``). When the entry's pattern occurs in
    ``here_person_form`` it is replaced by the entry's substitution;
    otherwise the entry's fallback suffix is appended to the form.
    """
    ending_pattern, fused_ending, appended_suffix = PORTMANTEAUS[(person, direction)]
    if re.search(ending_pattern, here_person_form, flags) is None:
        return here_person_form + appended_suffix
    return re.sub(ending_pattern, fused_ending, here_person_form, flags=flags)

Causative / Present Active / Cheers

# Causative "me" suffix rules as ordered (pattern, replacement) pairs, the
# shape that apply_person iterates over (first matching pair is used).
# The nasal alternative is [mn]+ rather than [mn]*: with * the first
# pattern also matches the empty string at the end of every form, so the
# two rules after it could never be reached.
CAUSATIVE_ME_PAIRS   = [("([mn]+|ng)$", 'niya'), ("([aeiouwy])$", r'\1niya'), ("$", 'iniya')]
# Ordered (pattern, replacement) suffix rules, the shape that apply_person
# iterates over (first matching pair is used).
CAUSATIVE_YOU_PAIRS = [
    (r'(m+|ng)$', 'ntaya'),
    (r'([dt])+$', 'ttsaya'),
    (r'([aeiourlhzsyw])$', r'\1taya'),
    (r'([pbkg])$', r'\1saya'),
    (r'$', 'etaya'),
]
CAUSATIVE_THEM_PAIRS = [
    (r'([aeiouwyrh])r?$', r'\1rheya'),
    (r'$', 'erheya'),
]
PRESENT_ACTIVE_PAIRS = [
    (r'oo$', 'waam'),
    (r'[eoa]+$', 'aam'),
    (r'([iu])$', r'\1yaam'),
    (r'$', 'aam'),
]
CHEERS_PAIRS = [
    (r'([aeou])[iy]$', r'\1iyets!'),
    (r'([^aeiou])$', r'\1eyets!'),
    (r'[ou]+$', 'oyets!'),
    (r'[aei]*$', 'eyets!'),
    (r'$', 'eyets!'),
]

Imperative Patterns and Replacements

# Seven start-anchored patterns for the imperative forms. Each list in
# IMP_REPLS has seven entries as well, one per pattern by position.
#
# NOTE(review): entries [2] and [4] are the same string (and the matching
# IMP_REPLS columns are identical too), so under first-match-wins [4]
# would never be selected — confirm whether [4] was meant to differ.
# NOTE(review): for input starting with "sh"/"zh", [2] already matches via
# ([sz]) followed by h?, so the [sz]h alternative in [3] looks unreachable
# if the patterns are tried in list order — confirm the intended order.
IMP_PATTERNS = [
    r'^(h?([aeiouyw])|[pbvf][pb]?h?)',
    r'^[sz](h?)([aeiou])',
    r'^h?[aeiou]?(([sz])|[dt][td]?)h?',
    r'^h?[aeiou]?([gk][gk]?h?|[sz]h)',
    r'^h?[aeiou]?(([sz])|[dt][td]?)h?',
    r'^[m]([aeiou])',
    r'^h?',  # catch-all
]
 
# Replacement templates for the imperative, keyed by variant name.
# Every list has seven entries, one per IMP_PATTERNS entry by position;
# \N refers to the capture groups of the pattern at the same position.
# The code that pairs a pattern with its replacement is not part of this
# section.
IMP_REPLS = {
    'mild':        [r'ipp\2',    r'itts\1\2',  r'itt\2',  'ikk',   r'itt\2',  r'ibb\1',  'ippe'   ],
    'regular':     [r'epp\2',    r'etts\1\2',  r'ett\2',  'ekk',   r'ett\2',  r'ebb\1',  'eppe'   ],
    'strong':      [r'app\2',    r'atts\1\2',  r'att\2',  'akk',   r'att\2',  r'abb\1',  'appe'   ],
    'silly_int':   [r'ayopp\2',  r'ayotts\1\2',r'ayott\2','ayokk', r'ayott\2',r'ayobb\1','ayoppe' ],
    'dism_int':    [r'eumb\2',    r'eundz\1\2', r'eund\2', 'eung',  r'eund\2', r'eumb\1', 'eumbe'  ],
    'mild_int':    [r'iyepp\2',  r'iyetts\1\2',r'iyyett\2','iyyekk',r'iyyett\2',r'iyyebb\1','iyyeppe'],
    'reg_int':     [r'eyapp\2',  r'eyatts\1\2',r'eyyatt\2','eyyakk',r'eyyatt\2',r'eyyabb\1','eyyappe'],
    'most_int':    [r'ayapp\2',  r'ayatts\1\2',r'ayyatt\2','ayyakk',r'ayyatt\2',r'ayyabb\1','ayyappe'],
    'silly_imp':   [r'opp\2',    r'otts\1\2',  r'ott\2',  'okk',   r'ott\2',  r'obb\1',  'oppe'   ],
    'dism_imp':    [r'mb\2',     r'ndz\1\2',   r'nd\2',   'ng',    r'nd\2',   r'mb\1',   'mbe'    ],
}
yivalkes/regex.txt · Last modified: by wikarai